import urllib.request
# Download the page and print its decoded HTML.
url = "https://disp.cc/b/PttHot"  # The website url you want to access
# The context manager closes the HTTP response (and its underlying
# connection) as soon as the body has been read, even if read() raises.
with urllib.request.urlopen(url) as response:
    data = response.read()  # raw response body, a `bytes` object
# 'utf-8-sig' decodes UTF-8 and drops a leading byte-order mark if present.
text = data.decode('utf-8-sig')
print(text)
執行
py -3 main.py
可以看到這個網頁目前的 html 已經都抓下來了
現在的目標就是抓取"你要抓的文字"
這個部分可以交給第三方套件 BeautifulSoup 處理（它不是 python3 內建的，需要先安裝：pip install beautifulsoup4 lxml）
from bs4 import BeautifulSoup
# Parse the downloaded HTML text into a searchable tree using the lxml parser.
soup = BeautifulSoup(text, features="lxml")
回到剛剛利用 F12 所看到的 html，可以發現每篇文章的標題內容都是放在下面這種 span 標籤裡（XXXXX 是該篇文章在列表中的編號）：
<span class="L34 nowrap listTitle" id="titleXXXXX">
import urllib.request
from bs4 import BeautifulSoup


def downLoad():
    """Download the PttHot board page and return its HTML as text.

    Returns:
        str: The response body decoded with the 'utf-8-sig' codec
        (UTF-8, with a leading byte-order mark removed if present).

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the response body is not valid UTF-8.
    """
    url = "https://disp.cc/b/PttHot"
    # The context manager closes the HTTP response (and its underlying
    # connection) once the body has been read, even if read() raises.
    with urllib.request.urlopen(url) as response:
        data = response.read()  # a `bytes` object
    # a `str`; this step can't be used if data is binary
    text = data.decode('utf-8-sig')
    return text


print("downloading.. ")
print("=============================")
text = downLoad()
# Build the parse tree from the downloaded HTML using the lxml parser.
soup = BeautifulSoup(text, features="lxml")

# Collect every <span class="list-num">; its text is used below as the
# numeric suffix of the matching title span's id.
listIdxs = soup.body.find_all('span', class_='list-num')
for listIdx in listIdxs:
    title_id = 'title' + listIdx.get_text()
    # Title spans carry the exact class string 'L34 nowrap listTitle'
    # and the id "title<number>".
    targets = soup.body.find_all(
        'span', class_='L34 nowrap listTitle', id=title_id
    )
    for ta in targets:
        print(ta.get_text())