#!/usr/bin/python
# -*- coding: utf-8 -*-
'''@author: duncan'''
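# Crawls the "following" lists of Weibo users through the m.weibo.cn
# container API (via rotating proxies) and stores (suid, tuid) edges
# in the `relationships` table of a MySQL database. Written for
# Python 2 (urllib2, cookielib, BeautifulSoup 3, MySQLdb).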
import requests
import json
import time
import random
import urllib2
import re
from BeautifulSoup import BeautifulSoup
import MySQLdb as mysql
import cookielib
import urllib
import threading
def crawlDetailPage(url):
    # Use proxy IPs. A dict keeps only one value per scheme, so hold
    # the HTTP proxies in a pool and pick one per request.
    http_proxies = ["http://183.159.91.135:18118",
                    "http://183.128.32.204:18118",
                    "http://125.127.158.162:61234",
                    "http://111.183.231.253:61234"]
    proxy = {'http': random.choice(http_proxies),
             'https': 'https://60.177.226.43:18118'}
    ids = set()
    req = requests.get(url, proxies=proxy)
    data = json.loads(req.text)
    # Each card in the response describes one followed user.
    for card in data['data']['cards']:
        ids.add(str(card['user']['id']))
    return ids
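# The endpoint crawled above returns JSON shaped roughly like
# {"data": {"cards": [{"user": {"id": 1234567890}}, ...]}}
# (inferred from the parsing in crawlDetailPage).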
# Parse a raw "key=value; key=value" cookie header string into a dict.
def ParseCookie(cookie):
    res = {}
    for cook in cookie.split(";"):
        # Split on the first '=' only; cookie values may themselves contain '='.
        key, value = cook.split("=", 1)
        res[key.strip()] = value.strip()
    return res
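# For example, ParseCookie("a=1; b=x=y") returns {'a': '1', 'b': 'x=y'}.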
def GetContainerId(id):
    url = "https://weibo.com/" + id
    cookies = "SINAGLOBAL=4043605503975.1895.1479819188716; UM_distinctid=1621ef12ef35c9-010737f44f7c2d-3a75045d-1fa400-1621ef12ef45eb; _s_tentry=ent.ifeng.com; YF-V5-G0=b1e3c8e8ad37eca95b65a6759b3fc219; Apache=1879352856775.3337.1523458056359; ULV=1523458056368:63:1:1:1879352856775.3337.1523458056359:1522138879467; YF-Page-G0=ee5462a7ca7a278058fd1807a910bc74; YF-Ugrow-G0=b02489d329584fca03ad6347fc915997; login_sid_t=bbb9fec459ca99f483b3f769202275a0; cross_origin_proto=SSL; wb_cmtLike_6050650176=1; appkey=; WBtopGlobal_register_version=25c556e6eb9b606e; un=13776120509; WBStorage=96e2695964e412de|undefined; UOR=baike.baidu.com,widget.weibo.com,login.sina.com.cn; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WhWjTyGjFD-0cyZI7kC7IYg5JpX5K2hUgL.Foe4S0eEShM0Sh22dJLoI0YLxKBLBonL1h5LxK-L1hnL1hMLxK.L1heLB.x4ICH8Sb-4SEHWeFH8Sb-R1C-ReFH81FHFeF-4e05pe8Yf1K-t; ALF=1556764397; SSOLoginState=1525228398; SCF=An2pxqirryGJ7asCvq8npDbZpxwysghsp8yKuZvilRtrh0zFAoo3eBbnLn1KC2mL_lo4_H_nCbctjb0sJ3ONd8k.; SUB=_2A2537VM-DeRhGeVH7FET9CnPzz2IHXVUm8P2rDV8PUNbmtBeLW7hkW9NTv9QmISskVXtb_jtCvl7DXUn8lSW3Exb; SUHB=0RyaX0Q7gYyzxE; wvr=6"
    headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
               'Accept-Language': 'zh-CN,zh;q=0.9',
               'Cache-Control': 'max-age=0',
               'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36',
               'Connection': 'keep-alive',
               # 'Host': 'weibo.com',
               'Upgrade-Insecure-Requests': '1',
               'Referer': "https://weibo.com/",
               # Reuse the saved session cookies with the request.
               'Cookie': cookies}
    # Keep any cookies the server sets during the exchange; installing
    # the opener makes urlopen below go through the cookie jar.
    cj = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
    urllib2.install_opener(opener)
    login = {"username": "13776120509", "password": "aaasssddd"}
    post_data = urllib.urlencode(login)
    req = urllib2.Request(url, post_data, headers)
    response = urllib2.urlopen(req)
    data = response.read()
    soup = BeautifulSoup(data)
    results = soup.findAll("script", attrs={'type': 'text/javascript'})
    string = str(results)
    # The page id is embedded in an inline script as $CONFIG['page_id']='...'.
    r = re.findall(r"\[\'page_id\'\]=\'(\d+)\'", string)
    return r[0]
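# Note: the crawl flow below never calls GetContainerId; Following()
# builds its container id directly by prefixing "100505" to the uid.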
def GetFollowing(pid):
    db = mysql.connect(host="192.168.131.191", port=3306, db="weibo",
                       user="root", passwd="", charset='utf8')
    cursor = db.cursor()
    # All known user ids.
    cursor.execute("select id from users")
    total_ids = set()
    for res in cursor.fetchall():
        total_ids.add(res[0])
    # Ids of non-common ("famous") users, whose following lists we crawl.
    famous_ids = set()
    cursor.execute("select id from users where category != 'Common'")
    for res in cursor.fetchall():
        famous_ids.add(res[0])
    # Split the famous users into 10 shards; worker pid handles shard pid.
    count = 0
    period = len(famous_ids) / 10
    start = (pid - 1) * period
    end = pid * period
    # Crawl the following list of each famous user in this shard;
    # sorting makes the shard boundaries deterministic across runs.
    for id in sorted(famous_ids):
        count += 1
        # The first 65 users were already crawled (hard-coded resume
        # point); also skip ids outside this worker's shard.
        if count <= 65 or count < start:
            continue
        if count > end:
            break
        # Skip users whose relationships are already in the database.
        cursor.execute("select count(*) from relationships where suid = %s", (id,))
        if cursor.fetchone()[0] >= 1:
            continue
        following = Following(str("100505" + id))
        # Keep only followed users that exist in our users table.
        following &= total_ids
        print len(following)
        for tuid in following:
            cursor.execute("insert into relationships(suid,tuid) values(%s,%s)",
                           (id, tuid))
        db.commit()
        print "Finished %d users" % count
        # time.sleep(random.choice([0,1,2,3,4,5]))
    cursor.close()
    db.close()
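# A quick sanity check on what GetFollowing stored; a minimal sketch
# reusing the same connection settings as above (CountEdges itself is
# a hypothetical helper, not part of the crawl flow).
def CountEdges():
    db = mysql.connect(host="192.168.131.191", port=3306, db="weibo",
                       user="root", passwd="", charset='utf8')
    cursor = db.cursor()
    cursor.execute("select count(*) from relationships")
    total = cursor.fetchone()[0]
    cursor.close()
    db.close()
    return total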
# Fetch the full following list for one container id, page by page.
def Following(uid):
    print "Fetching the following list of user %s" % uid
    following = set()
    url = "https://m.weibo.cn/api/container/getSecond?containerid=%s_-_FOLLOWERS&page=" % uid
    pageid = 1
    flag = True
    while flag:
        try:
            following |= crawlDetailPage(url + str(pageid))
            time.sleep(random.choice([1, 2, 3, 4]))
            print "Page %d" % pageid
            pageid += 1
            if pageid > 100:
                break
        except Exception as e:
            # A failed or empty page marks the end of the list.
            flag = False
    return following
# The crawl is I/O-bound, so fan out one thread per shard.
def run():
    pid = 1
    while pid <= 10:
        t = threading.Thread(target=GetFollowing, args=(pid,))
        t.start()
        pid += 1

if __name__ == '__main__':
    run()
    # Single-shard / single-user runs for testing:
    # GetFollowing(1)
    # Following("1005053973247341")