11# Github项目分析一
22
3- #用matplotlib生成图表
3+ ## 用matplotlib生成图表
44
55如何分析用户的数据是一个有趣的问题,特别是当我们有大量的数据的时候。
66除了`` matlab `` ,我们还可以用`` numpy `` +`` matplotlib ``
77
8- ##python github用户数据分析##
8+ ### python github用户数据分析##
99
1010数据可以在这边寻找到
1111
1212[ https://github.com/gmszone/ml ] ( https://github.com/gmszone/ml )
1313
1414最后效果图
15- <img src =" https://raw.githubusercontent.com/gmszone/ml/master/screenshots/2014-01-01.png " width =600 >
15+
16+ ![ 2014 01 01] ( ./img/2014-01-01.png )
1617
1718要解析的json文件位于`` data/2014-01-01-0.json `` ,大小6.6M,显然我们可能需要用每次只读一行的策略,这足以解释为什么诸如sublime打开的时候很慢,而现在我们只需要里面的json数据中的创建时间。
1819
19- ==
20- 这个文件代表什么?
20+ ==这个文件代表什么?
2121
2223** 2014年1月1日零时到一时,用户在github上的操作,这里的用户指的是很多用户。一共有4814条数据,从commit、create到issues都有。**
2323
24- ##python json文件解析##
24+ ### python json文件解析##
2525
26- import json
27- for line in open(jsonfile):
28- line = f.readline()
26+ ``` python
27+ import json
28+ for line in open (jsonfile):
29+ line = f.readline()
30+ ```
2931
3032然后再解析json
31- <pre ><code class =" python " >
33+
34+ ``` python
3235import dateutil.parser
3336
3437lin = json.loads(line)
3538date = dateutil.parser.parse(lin[" created_at" ])
36- </code ></pre >
39+ ```
40+
3741这里用到了`` dateutil `` ,因为新鲜出炉的数据是string,需要用`` dateutil `` 转换为日期对象,再把数据放到数组里头。最后就有了`` parse_data ``
3842
43+ ``` python
3944def parse_data (jsonfile ):
4045 f = open (jsonfile, " r" )
4146 dataarray = []
@@ -51,21 +56,27 @@ def parse_data(jsonfile):
5156 minuteswithcount = [(x, dataarray.count(x)) for x in set (dataarray)]
5257 f.close()
5358 return minuteswithcount
54-
59+ ```
5560
5661下面这句代码就是将上面的解析为
5762
58- minuteswithcount = [(x, dataarray.count(x)) for x in set(dataarray)]
63+ ``` python
64+ minuteswithcount = [(x, dataarray.count(x)) for x in set (dataarray)]
65+ ```
5966
6067这样的数组以便于解析
6168
62- [(0, 92), (1, 67), (2, 86), (3, 73), (4, 76), (5, 67), (6, 61), (7, 71), (8, 62), (9, 71), (10, 70), (11, 79), (12, 62), (13, 67), (14, 76), (15, 67), (16, 74), (17, 48), (18, 78), (19, 73), (20, 89), (21, 62), (22, 74), (23, 61), (24, 71), (25, 49), (26, 59), (27, 59), (28, 58), (29, 74), (30, 69), (31, 59), (32, 89), (33, 67), (34, 66), (35, 77), (36, 64), (37, 71), (38, 75), (39, 66), (40, 62), (41, 77), (42, 82), (43, 95), (44, 77), (45, 65), (46, 59), (47, 60), (48, 54), (49, 66), (50, 74), (51, 61), (52, 71), (53, 90), (54, 64), (55, 67), (56, 67), (57, 55), (58, 68), (59, 91)]
69+ ``` python
70+ [(0 , 92 ), (1 , 67 ), (2 , 86 ), (3 , 73 ), (4 , 76 ), (5 , 67 ), (6 , 61 ), (7 , 71 ), (8 , 62 ), (9 , 71 ), (10 , 70 ), (11 , 79 ), (12 , 62 ), (13 , 67 ), (14 , 76 ), (15 , 67 ), (16 , 74 ), (17 , 48 ), (18 , 78 ), (19 , 73 ), (20 , 89 ), (21 , 62 ), (22 , 74 ), (23 , 61 ), (24 , 71 ), (25 , 49 ), (26 , 59 ), (27 , 59 ), (28 , 58 ), (29 , 74 ), (30 , 69 ), (31 , 59 ), (32 , 89 ), (33 , 67 ), (34 , 66 ), (35 , 77 ), (36 , 64 ), (37 , 71 ), (38 , 75 ), (39 , 66 ), (40 , 62 ), (41 , 77 ), (42 , 82 ), (43 , 95 ), (44 , 77 ), (45 , 65 ), (46 , 59 ), (47 , 60 ), (48 , 54 ), (49 , 66 ), (50 , 74 ), (51 , 61 ), (52 , 71 ), (53 , 90 ), (54 , 64 ), (55 , 67 ), (56 , 67 ), (57 , 55 ), (58 , 68 ), (59 , 91 )]
71+ ```
6372
64- ##matplotlib##
65- 开始之前需要安装``matplotlib
73+ ## matplotlib
6674
67- sudo pip install matplotlib
75+ 开始之前需要安装`` matplotlib ``
6876
77+ ``` bash
78+ sudo pip install matplotlib
79+ ```
6980然后引入这个库
7081
7182 import matplotlib.pyplot as plt
@@ -81,67 +92,68 @@ def parse_data(jsonfile):
8192
8293最后代码可见
8394
84- #!/usr/bin/env python
85- # -*- coding: utf-8 -*-
86-
87- import json
88- import dateutil.parser
89- import numpy as np
90- import matplotlib.mlab as mlab
91- import matplotlib.pyplot as plt
92-
93-
94- def parse_data(jsonfile):
95- f = open(jsonfile, "r")
96- dataarray = []
97- datacount = 0
98-
99- for line in open(jsonfile):
100- line = f.readline()
101- lin = json.loads(line)
102- date = dateutil.parser.parse(lin["created_at"])
103- datacount += 1
104- dataarray.append(date.minute)
105-
106- minuteswithcount = [(x, dataarray.count(x)) for x in set(dataarray)]
107- f.close()
108- return minuteswithcount
109-
110-
111- def draw_date(files):
112- x = []
113- y = []
114- mwcs = parse_data(files)
115- for mwc in mwcs:
116- x.append(mwc[0])
117- y.append(mwc[1])
118-
119- plt.figure(figsize=(8,4))
120- plt.plot(x, y,label = files)
121- plt.legend()
122- plt.show()
123-
124- draw_date("data/2014-01-01-0.json")
125-
126-
127- #每周分析
95+
96+ ``` python
97+ # !/usr/bin/env python
98+ # -*- coding: utf-8 -*-
99+
100+ import json
101+ import dateutil.parser
102+ import numpy as np
103+ import matplotlib.mlab as mlab
104+ import matplotlib.pyplot as plt
105+
106+
107+ def parse_data (jsonfile ):
108+ f = open (jsonfile, " r" )
109+ dataarray = []
110+ datacount = 0
111+
112+ for line in open (jsonfile):
113+ line = f.readline()
114+ lin = json.loads(line)
115+ date = dateutil.parser.parse(lin[" created_at" ])
116+ datacount += 1
117+ dataarray.append(date.minute)
118+
119+ minuteswithcount = [(x, dataarray.count(x)) for x in set (dataarray)]
120+ f.close()
121+ return minuteswithcount
122+
123+
124+ def draw_date (files ):
125+ x = []
126+ y = []
127+ mwcs = parse_data(files)
128+ for mwc in mwcs:
129+ x.append(mwc[0 ])
130+ y.append(mwc[1 ])
131+
132+ plt.figure(figsize = (8 ,4 ))
133+ plt.plot(x, y,label = files)
134+ plt.legend()
135+ plt.show()
136+
137+ draw_date(" data/2014-01-01-0.json" )
138+ ```
139+
140+ ## 每周分析
128141
129142继上篇之后,我们就可以分析用户的每周提交情况,以得出用户的真正的工作效率,每个程序员的工作时间可能是不一样的,如
130- ![ Phodal Huang's Report] [ 1 ]
131143
132- [ 1 ] : https://www.phodal.com/static/media/uploads/screen_shot_2014-04-12_at_9.58.52_am.png
144+ ![ Phodal Huang's Report ] ( ./img/phodal-results )
133145
134146这是我的每周情况,显然如果把星期六移到前面的话,随着工作时间的增长,在github上的使用在下降,作为一个
135147
136148 a fulltime hacker who works best in the evening (around 8 pm).
137149
138150不过这个是osrc的分析结果。
139151
140- ##python github 每周情况分析##
152+ ### python github 每周情况分析
141153
142154看一张分析后的结果
143155
144- < img src = " https://raw.githubusercontent.com/gmszone/ml/master/screenshots/ feb-results.png" width = 600 >
156+ ![ Feb Results ] ( ./img/ feb-results.png)
145157
146158结果正好与我的情况相反?似乎图上是这么说的,但是数据上是这样的情况。
147159
@@ -174,67 +186,71 @@ def parse_data(jsonfile):
174186 8474, 7984, 12933, 13504, 13763, 13544, 12940,
175187 7119, 7346, 13412, 14008, 12555
176188
177- ##python 数据分析##
189+ ### python 数据分析
178190
179191重写了一个新的方法用于计算提交数,直至后面才意识到其实我们可以算行数就够了,但是方法上有点hack
180192
181- <pre ><code class =" python " >
182- def get_minutes_counts_with_id(jsonfile):
183- datacount, dataarray = handle_json(jsonfile)
184- minuteswithcount = [(x, dataarray.count(x)) for x in set(dataarray)]
185- return minuteswithcount
186-
187-
188- def handle_json(jsonfile):
189- f = open(jsonfile, "r")
190- dataarray = []
191- datacount = 0
192-
193- for line in open(jsonfile):
194- line = f.readline()
195- lin = json.loads(line)
196- date = dateutil.parser.parse(lin["created_at"])
197- datacount += 1
198- dataarray.append(date.minute)
199-
200- f.close()
201- return datacount, dataarray
202-
203-
204- def get_minutes_count_num(jsonfile):
205- datacount, dataarray = handle_json(jsonfile)
206- return datacount
207-
208-
209- def get_month_total():
210- """
211-
212- :rtype : object
213- """
214- monthdaycount = []
215- for i in range(1, 20):
216- if i < 10:
217- filename = 'data/2014-02-0' + i.__str__() + '-0.json'
218- else:
219- filename = 'data/2014-02-' + i.__str__() + '-0.json'
220- monthdaycount.append(get_minutes_count_num(filename))
221- return monthdaycount
222- </code ></pre >
193+ ``` python
194+ def get_minutes_counts_with_id (jsonfile ):
195+ datacount, dataarray = handle_json(jsonfile)
196+ minuteswithcount = [(x, dataarray.count(x)) for x in set (dataarray)]
197+ return minuteswithcount
198+
199+
200+ def handle_json (jsonfile ):
201+ f = open (jsonfile, " r" )
202+ dataarray = []
203+ datacount = 0
204+
205+ for line in open (jsonfile):
206+ line = f.readline()
207+ lin = json.loads(line)
208+ date = dateutil.parser.parse(lin[" created_at" ])
209+ datacount += 1
210+ dataarray.append(date.minute)
211+
212+ f.close()
213+ return datacount, dataarray
214+
215+
216+ def get_minutes_count_num (jsonfile ):
217+ datacount, dataarray = handle_json(jsonfile)
218+ return datacount
219+
220+
221+ def get_month_total ():
222+ """
223+
224+ :rtype : object
225+ """
226+ monthdaycount = []
227+ for i in range (1 , 20 ):
228+ if i < 10 :
229+ filename = ' data/2014-02-0' + i.__str__ () + ' -0.json'
230+ else :
231+ filename = ' data/2014-02-' + i.__str__ () + ' -0.json'
232+ monthdaycount.append(get_minutes_count_num(filename))
233+ return monthdaycount
234+ ```
235+
223236接着我们需要去遍历每个结果,后面的后面会发现这个效率真的是太低了,为什么木有多线程?
224237
225- ##python matplotlib图表##
238+ ### python matplotlib图表
239+
226240让我们的matplotlib来做这些图表的工作
227241
228- if __name__ == '__main__':
229- results = pd.get_month_total()
230- print results
231-
232- plt.figure(figsize=(8, 4))
233- plt.plot(results.__getslice__(0, 7), label="first week")
234- plt.plot(results.__getslice__(7, 14), label="second week")
235- plt.plot(results.__getslice__(14, 21), label="third week")
236- plt.legend()
237- plt.show()
242+ ``` python
243+ if __name__ == ' __main__' :
244+ results = pd.get_month_total()
245+ print results
246+
247+ plt.figure(figsize = (8 , 4 ))
248+ plt.plot(results.__getslice__ (0 , 7 ), label = " first week" )
249+ plt.plot(results.__getslice__ (7 , 14 ), label = " second week" )
250+ plt.plot(results.__getslice__ (14 , 21 ), label = " third week" )
251+ plt.legend()
252+ plt.show()
253+ ```
238254
239255蓝色的是第一周,绿色的是第二周,红色的是第三周,就有了上面的结果。
240256
0 commit comments