# -*- coding: utf-8 -*-
# Python 2 scraper for the chinanews.com scroll-news pages.
import os
import re
import sys
import urllib2

# Python 2 workaround so mixed str/unicode writes do not raise UnicodeDecodeError.
reload(sys)
sys.setdefaultencoding('utf8')
def askURL(url):
    """Fetch a page and return its HTML transcoded to UTF-8 (empty string on error)."""
    html = ''
    request = urllib2.Request(url)
    try:
        response = urllib2.urlopen(request)
        html = response.read()
        # The site serves GBK-encoded pages; transcode to UTF-8 for the rest of the script.
        html = html.decode('gbk', 'ignore')
        html = html.encode('utf-8', 'ignore')
    except urllib2.URLError, e:
        if hasattr(e, "code"):
            print e.code
        if hasattr(e, "reason"):
            print e.reason
    return html
def getContent(url):
    """Extract the article body from a news page and strip markup and boilerplate."""
    html = askURL(url)
    text = ''
    # The article body sits between the "left_zw" div and the pager div.
    findDiv = re.compile(r'<div class="left_zw" style="position:relative">'
                         r'(.*)<div id="function_code_page">', re.S)
    div = re.findall(findDiv, html)
    if len(div) != 0:
        content = div[0]
        # Drop HTML tags, then agency boilerplate such as the dateline and byline.
        labels = re.compile(r'[/]*<.*?>', re.S)
        text = re.sub(labels, '', content)
        text = re.sub(r'\s|中新社.*?电|\(完\)|\(记者.*?\)', '', text)
        text = re.sub(r' ', '\n', text)
    return text
def saveFile(labelName, date, fileNum):
    """Create news\<label>\<date>-0<n>.txt (Windows-style paths) and return the open file."""
    dirname = "news"
    if not os.path.exists(dirname):
        os.mkdir(dirname)
    # Encode the folder name as GBK so it displays correctly on a Chinese Windows system.
    labelName = labelName.encode('gbk', 'ignore')
    labelpath = dirname + '\\' + labelName
    if not os.path.exists(labelpath):
        os.mkdir(labelpath)
    path = labelpath + "\\" + date + "-0" + str(fileNum) + ".txt"
    print "Downloading " + path
    f = open(path, 'w+')
    return f
def getURL(nurl, labelName):
    """Scan one scroll-news index page and save every article that matches labelName."""
    html = askURL(nurl)
    findDiv = re.compile(r'<div class="dd_lm">.*</div>')
    findTime = re.compile(r'<div class="dd_time">(.*)</div>')
    findTitle = re.compile(r'<div class="dd_bt"><a href="http://www.chinanews.com/.*\.shtml">(.*)</a></div>')
    findURL = re.compile(r'<div class="dd_bt"><a href="(http://www.chinanews.com/.*\.shtml)">.*</a></div>')
    findLabel = re.compile(r'<div class="dd_lm">\[<a href=http://www.chinanews.com/.*\.shtml>(.*)</a>\]</div>')
    fileNum = 0
    for info in re.findall(findDiv, html):
        time = re.findall(findTime, info)[0]
        date = re.findall(r'\d+-\d+', time)[0]   # e.g. "12-25" out of "12-25 10:30"
        title = re.findall(findTitle, info)[0]
        url = re.findall(findURL, info)[0]
        label = re.findall(findLabel, info)[0]
        if label == "I T":
            label = "IT"
        if labelName == label:
            text = getContent(url)
            # Keep only articles long enough to be useful.
            if len(text) > 1000:
                fileNum = fileNum + 1
                f = saveFile(labelName, date, fileNum)
                f.write(title)
                f.write(text)
                f.close()
def getNews(url, begin_page, end_page, labelName):
    """Crawl index pages begin_page..end_page for one category."""
    for i in range(begin_page, end_page + 1):
        nurl = url + str(i) + ".html"
        getURL(nurl, labelName)
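Called non-interactively, bypassing the menu in main below, the crawler boils down to one getNews call per category; the page range and the Chinese category label here are only example values:

    # Example: download pages 1-3 of the 体育 (sports) category.
    getNews('http://www.chinanews.com/scroll-news/news', 1, 3, '体育')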
def main():
    # Index pages are named .../scroll-news/news<page>.html
    url = 'http://www.chinanews.com/scroll-news/news'
    prompt = ('Enter the category number (IT=1, Finance=2, Local=3, World=4, '
              'Domestic=5, Health=6, Military=7, Society=8, Sports=9, Culture=10), '
              '-1 to quit, 0 for all categories:\n')
    ch = int(raw_input(prompt))
    # The labels must stay in Chinese: they are matched against the category
    # link text on chinanews.com.
    labels = ('IT', '财经', '地方', '国际', '国内', '健康', '军事', '社会', '体育', '文化')
    while ch != -1:
        begin_page = int(raw_input('Enter the first page number (starting from 1):\n'))
        end_page = int(raw_input('Enter the last page number:\n'))
        if ch >= 1 and ch <= 10:
            getNews(url, begin_page, end_page, labels[ch - 1])
        elif ch == 0:
            for label in labels:
                getNews(url, begin_page, end_page, label)
        else:
            print "Invalid input, please try again."
        ch = int(raw_input(prompt))
if __name__ == '__main__':
    main()
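The script above targets Python 2 (urllib2, print statements, raw_input). For reference, here is a minimal sketch of the same fetch-and-transcode step under Python 3, assuming the site still serves GBK-encoded pages; the function name is illustrative and not part of the original script:

    # Python 3 sketch of askURL (assumes the pages are still GBK-encoded).
    import urllib.request
    import urllib.error

    def ask_url_py3(url):
        try:
            with urllib.request.urlopen(url) as response:
                raw = response.read()
            # Decode the GBK bytes into a Python 3 str; no re-encoding step is needed.
            return raw.decode('gbk', 'ignore')
        except urllib.error.URLError as e:
            if hasattr(e, 'code'):
                print(e.code)
            if hasattr(e, 'reason'):
                print(e.reason)
            return ''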