Python Function Usage

Using enumerate()

  • If you need to iterate over both the indices and the elements of a list, a first attempt might look like this:
list1 = ["这", "是", "一个", "测试"]
for i in range(len(list1)):
    print(i, list1[i])
  • The approach above is a bit clumsy; using enumerate() is more direct and elegant:
list1 = ["这", "是", "一个", "测试"]
for index, item in enumerate(list1):
    print(index, item)
>>>
0 这
1 是
2 一个
3 测试
  • enumerate() also accepts a second argument that sets the starting value of the index, for example:
list1 = ["这", "是", "一个", "测试"]
for index, item in enumerate(list1, 1):
    print(index, item)
>>>
1 这
2 是
3 一个
4 测试

Usage of stack() and unstack()

import numpy as np
import pandas as pd

data = pd.DataFrame(np.arange(6).reshape((3, 2)),
                    index=pd.Index(['a', 'b', 'c'], name='state'),
                    columns=pd.Index(['I', 'II'], name='number'))
data
Out[627]:
number  I  II
state
a       0   1
b       2   3
c       4   5
result = data.unstack()
result
Out[628]:
number  state
I       a    0
        b    2
        c    4
II      a    1
        b    3
        c    5
  • type(result)  # pandas.core.series.Series — the DataFrame has been unstacked into a Series with a two-level MultiIndex.
    df = pd.DataFrame({'left': result, 'right': result + 5},
                      columns=pd.Index(['left', 'right'], name='side'))
    df
    Out[630]:
    side          left  right
    number state
    I      a         0      5
           b         2      7
           c         4      9
    II     a         1      6
           b         3      8
           c         5     10
df.stack()
Out[633]:
number  state  side
I       a      left      0
               right     5
        b      left      2
               right     7
        c      left      4
               right     9
II      a      left      1
               right     6
        b      left      3
               right     8
        c      left      5
               right    10
  • The result is a Series with a three-level index.

  • If you want side to be the column variable again, call unstack('side') on the stacked result:

df.stack().unstack('side')
Out[636]:
side          left  right
number state
I      a         0      5
       b         2      7
       c         4      9
II     a         1      6
       b         3      8
       c         5     10
  • df.unstack('number').stack('number')  # This changes the order of the MultiIndex levels: 'number' is first moved up to the column index and then stacked back down as the innermost row index level, so its position within the index changes.
Out[638]:
side          left  right
state number
a     I          0      5
      II         1      6
b     I          2      7
      II         3      8
c     I          4      9
      II         5     10

Deduplication with drop_duplicates()
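A minimal sketch of how drop_duplicates() works, using a made-up DataFrame purely for illustration (the column names k1 and k2 are not from these notes):

import pandas as pd

df = pd.DataFrame({'k1': ['one', 'one', 'two', 'two', 'two'],
                   'k2': [1, 1, 2, 3, 3]})
df.drop_duplicates()                      # drop rows duplicated across all columns -> keeps rows 0, 2, 3
df.drop_duplicates(['k1'])                # consider only k1 -> keeps the first 'one' and the first 'two'
df.drop_duplicates(['k1'], keep='last')   # keep the last occurrence of each k1 value instead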

Filtering

  • Row filtering
For example, what if we need to filter out the samples with appPlatform = 2 as well as those whose appID is 278 or 382? Very simple:

data[~data['appID'].isin([278, 382]) & ~data['appPlatform'].isin([2])]   # ~ negates each boolean mask
  • Filtering out values in a certain range
For the dataset data, suppose we want to filter out the samples whose creativeID (the first column) is greater than 10000.

data[data['creativeID'] <= 10000]

Handling the number of decimal places in floats

  • round()
>>> df = pd.DataFrame(np.random.random([3, 3]),
... columns=['A', 'B', 'C'], index=['first', 'second', 'third'])
>>> df
A B C
first 0.028208 0.992815 0.173891
second 0.038683 0.645646 0.577595
third 0.877076 0.149370 0.491027
>>> df.round(2)
A B C
first 0.03 0.99 0.17
second 0.04 0.65 0.58
third 0.88 0.15 0.49
>>> df.round({'A': 1, 'C': 2})
A B C
first 0.0 0.992815 0.17
second 0.0 0.645646 0.58
third 0.9 0.149370 0.49
>>> decimals = pd.Series([1, 0, 2], index=['A', 'B', 'C'])
>>> decimals
A 1
B 0
C 2
dtype: int64
>>> df.round(decimals)
A B C
first 0.0 1 0.17
second 0.0 1 0.58
third 0.9 0 0.49
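
For plain Python floats outside pandas, the built-in round() serves the same purpose; note that it uses round-half-to-even ("banker's rounding") for ties, and that binary floating point can make results look surprising:

round(1.5)         # 2
round(2.5)         # 2  (ties go to the nearest even integer)
round(2.675, 2)    # 2.67, because 2.675 is stored as a value slightly below 2.675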

Python for Data Analysis

ch02

import json

path = r'E:\PYTHON\PydataProject\ch02\data\example.txt'
a = open(path).readline()          # read one raw line to inspect the format
records = [json.loads(line) for line in open(path)]
for index, item in enumerate(records, 1):
    print(index, item)
-----------------------------------------------
3559 {'a': 'GoogleProducer', 'c': 'US', 'nk': 0, 'tz': 'America/Los_Angeles', 'gr': 'CA', 'g': 'zjtI4X', 'h': 'zjtI4X', 'l': 'bitly', 'hh': '1.usa.gov', 'r': 'direct', 'u': 'http://www.ahrq.gov/qual/qitoolkit/', 't': 1331926847, 'hc': 1327528527, 'cy': 'Mountain View', 'll': [37.419201, -122.057404]}
3560 {'a': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; MS-RTC LM 8; .NET4.0C; .NET4.0E; .NET CLR 1.1.4322)', 'c': 'US', 'nk': 0, 'tz': 'America/New_York', 'gr': 'VA', 'g': 'qxKrTK', 'h': 'qxKrTK', 'l': 'bitly', 'al': 'en-US', 'hh': '1.usa.gov', 'r': 'http://t.co/OEEEvwjU', 'u': 'http://herndon-va.gov/Content/public_safety/Public_Information/weekly_reports.aspx?cnlid=1736', 't': 1331926849, 'hc': 1312897670, 'cy': 'Mc Lean', 'll': [38.935799, -77.162102]}

records[:1]
Out[8]:
[{'a': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.78 Safari/535.11',
'al': 'en-US,en;q=0.8',
'c': 'US',
'cy': 'Danvers',
'g': 'A6qOVH',
'gr': 'MA',
'h': 'wfLQtf',
'hc': 1331822918,
'hh': '1.usa.gov',
'l': 'orofrog',
'll': [42.576698, -70.954903],
'nk': 1,
'r': 'http://www.facebook.com/l/7AQEFzjSi/1.usa.gov/wfLQtf',
't': 1331923247,
'tz': 'America/New_York',
'u': 'http://www.ncbi.nlm.nih.gov/pubmed/22415991'}]

List comprehension with a trailing if to check whether a field exists

time_zones = [rec['tz'] for rec in records if 'tz' in rec]

time_zones[:5]
Out[6]:
['America/New_York',
'America/Denver',
'America/New_York',
'America/Sao_Paulo',
'America/New_York']

Counting time zones in plain Python

from collections import defaultdict

def get_counts(sequence):
    counts = defaultdict(int)
    for x in sequence:
        counts[x] += 1
    return counts

counts = get_counts(time_zones)
num = counts['America/Los_Angeles']

counts
Out[11]:
defaultdict(int,
{'': 521,
'Africa/Cairo': 3,
'Africa/Casablanca': 1,
'Africa/Ceuta': 2,
'Africa/Johannesburg': 1,
'Africa/Lusaka': 1,
'America/Anchorage': 5,
'America/Argentina/Buenos_Aires': 1
……})

num
Out[14]: 382

Top ten time zones and their counts

def top_counts(count_dict, n=10):
    value_key_pairs = [(count, tz) for tz, count in count_dict.items()]
    value_key_pairs.sort()
    return value_key_pairs[-n:]

top_counts(counts)

----------------------------------------------
top_counts(counts)
Out[25]:
[(33, 'America/Sao_Paulo'),
(35, 'Europe/Madrid'),
(36, 'Pacific/Honolulu'),
(37, 'Asia/Tokyo'),
(74, 'Europe/London'),
(191, 'America/Denver'),
(382, 'America/Los_Angeles'),
(400, 'America/Chicago'),
(521, ''),
(1251, 'America/New_York')]

Counting with collections.Counter is even more convenient

from collections import Counter
counts = Counter(time_zones)
counts.most_common(10)

------------------------------------------
counts.most_common(10)
Out[26]:
[('America/New_York', 1251),
('', 521),
('America/Chicago', 400),
('America/Los_Angeles', 382),
('America/Denver', 191),
('Europe/London', 74),
('Asia/Tokyo', 37),
('Pacific/Honolulu', 36),
('Europe/Madrid', 35),
('America/Sao_Paulo', 33)]

Counting time zones with pandas

from pandas import DataFrame, Series
import pandas as pd
import numpy as np

frame = DataFrame(records)
  • frame['tz'] returns a Series object, and that Series has a value_counts method.
    frame['tz'][:10]
    tz_count = frame['tz'].value_counts()
    tz_count[:10]

    --------------------------------------
    tz_count[:10]
    Out[24]:
    America/New_York 1251
    521
    America/Chicago 400
    America/Los_Angeles 382
    America/Denver 191
    Europe/London 74
    Asia/Tokyo 37
    Pacific/Honolulu 36
    Europe/Madrid 35
    America/Sao_Paulo 33
    Name: tz, dtype: int64

Data visualization with matplotlib

  • First deal with unknown or missing time zones in the records by filling in a substitute value.

The fillna method can replace missing (NA) values, while unknown values (empty strings) can be replaced through boolean array indexing.
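
A minimal sketch of that cleaning step, assuming the frame built above (the substitute labels 'Missing' and 'Unknown' are just placeholder names):

clean_tz = frame['tz'].fillna('Missing')   # NA values -> 'Missing'
clean_tz[clean_tz == ''] = 'Unknown'       # empty strings replaced via boolean indexing
tz_count = clean_tz.value_counts()         # recompute the counts on the cleaned column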

tz_count[:10].plot(kind='barh', rot=0)

(Figure: horizontal bar chart of the top 10 time zone counts)

  • split()
  • dropna()

    results = Series([x.split()[0] for x in frame.a.dropna()])
    results[:5]
    results.value_counts()[:8]
    Out[20]:
    Mozilla/5.0 2594
    Mozilla/4.0 601
    GoogleMaps/RochesterNY 121
    Opera/9.80 34
    TEST_INTERNET_AGENT 24
    GoogleProducer 21
    Mozilla/6.0 5
    BlackBerry8520/5.0.0.681 4
    dtype: int64
  • cframe = frame[frame.a.notnull()]

    cframe = frame[frame.a.notnull()]
    operating_system = np.where(cframe['a'].str.contains('Windows'), 'Windows', 'Not Windows')
    operating_system[:5]
    by_tz_os = cframe.groupby(['tz', operating_system])
  • size()

  • unstack()
  • Count the grouped results, then reshape the counts with unstack:
    agg_counts = by_tz_os.size().unstack().fillna(0)

    agg_counts
    Out[19]:
    Not Windows Windows
    tz
    245.0 276.0
    Africa/Cairo 0.0 3.0
    Africa/Casablanca 0.0 1.0
    Africa/Ceuta 0.0 2.0
    Africa/Johannesburg 0.0 1.0
    Africa/Lusaka 0.0 1.0
    America/Anchorage 4.0 1.0
    America/Argentina/Buenos_Aires 1.0 0.0
    America/Argentina/Cordoba 0.0 1.0
    America/Argentina/Mendoza 0.0 1.0
    America/Bogota 1.0 2.0
    America/Caracas 0.0 1.0
    America/Chicago 115.0 285.0
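
To actually visualize agg_counts, here is a minimal sketch in the same barh style as above (the row normalization at the end is an assumed extra step, not part of the notes above):

# keep the 10 time zones with the largest overall counts
indexer = agg_counts.sum(axis=1).argsort()
count_subset = agg_counts.take(indexer[-10:])

# stacked horizontal bar chart of the raw counts
count_subset.plot(kind='barh', stacked=True)

# normalizing each row to sum to 1 makes the relative share of Windows users easier to compare
normed_subset = count_subset.div(count_subset.sum(axis=1), axis=0)
normed_subset.plot(kind='barh', stacked=True)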