百度利用其強大的中文搜索引擎數據,結合地圖應用,包含了海量的公司聯繫方式,比 Google 要強,更別說什麼黃頁網站了。因為一些業務需要,寫了這個行業公司地址採集程序,使用方便,直接運行,支持通過命令行設定查詢參數。使用方法:把代碼保存成 bmap.py,然後運行 `python bmap.py`,或者在命令行指定查詢關鍵字,例如 `python bmap.py 服飾廠`。運行後會自動採集百度地圖中所有的結果,保存為以 tab 分隔的 txt 文件,方便導入各種數據庫。
018 |
sys.setdefaultencoding('utf-8') |
024 |
def __init__(self, keyword): |
025 |
self.keyword = keyword |
027 |
('b', '(-1599062.039999999,811604.75;24779177.96,8168020.75)'), |
036 |
('t', time.time().__int__()), |
037 |
('tn', 'B_NORMAL_MAP'), |
041 |
self.mapurl = 'http://map.baidu.com/' |
042 |
self.file = open('%s.txt' % keyword, 'w') |
049 |
def _fetch(self, query=None, json=True): |
050 |
data = urllib.urlencode(query) |
051 |
url = self.mapurl + '?' + data |
052 |
opener = urllib.FancyURLopener() |
053 |
data = opener.open(url).read() |
056 |
return self._tojson(data) |
060 |
def _tojson(self, data): |
062 |
js = json.loads(data, 'utf-8') |
069 |
data = self._fetch(self.query) |
071 |
if type(data['content']) is not types.ListType: |
072 |
print 'keyworld error.' |
075 |
self.city = data['content'] |
077 |
if data.has_key('more_city'): |
078 |
for c in data['more_city']: |
079 |
self.city.extend(c['city']) |
081 |
for city in self.city: |
082 |
self.total_num += city['num'] |
084 |
def _get_data(self, city, page=0): |
087 |
('b', '(%s)' % city['geo'].split('|')[1]), |
099 |
('t', time.time().__int__()), |
100 |
('tn', 'B_NORMAL_MAP'), |
101 |
('wd', self.keyword), |
104 |
data = self._fetch(query) |
107 |
def _save(self, content, city): |
116 |
_data = '%s\t%s\t%s\t%s\n' % (city['name'], c['name'], c['addr'], tel) |
117 |
self.file.write(_data) |
118 |
print '(%s/%s) %s[%s/%s]' % (self.count, self.total_num, city['name'], self.count_c, city['num']) |
122 |
pages = abs(-city['num'] / 10) |
123 |
for page in range(0, pages): |
124 |
data = self._get_data(city, page) |
125 |
if data.has_key('content'): |
126 |
self._save(data['content'], city) |
129 |
for city in self.city: |
135 |
if __name__ == '__main__': |
136 |
if sys.argv.__len__() > 1: |
137 |
keyword = sys.argv[1] |
141 |
baidumap = BaiduMap(keyword) |
143 |
print 'CITY: %s' % baidumap.city.__len__() |
144 |
print 'DATA: %s' % baidumap.total_num |
|