- bs4
- BeautifulSoup
Beautiful Soup to parse HTML
examples/web-client/parse_html.py
from bs4 import BeautifulSoup import requests url = 'https://en.wikipedia.org/wiki/Main_Page' res = requests.get(url) if res.status_code != 200: exit(f"Error in getting the page. Status code: {res.status_code}") html = res.content soup = BeautifulSoup(html, features='lxml') print(soup.title.text) for link in soup.find_all("a", limit=3): print(link) print(link.text) print(link.attrs.get('href')) print() print('-----------------------------------------') forms = soup.select("#searchform") if forms is not None: print(forms) form = forms[0] # We used an ID to search we expect to have 0 or one matches in the list print() print('Action: ', form.attrs.get('action')) # Search inside that element we found earlier for inp in form.find_all('input'): print('id: ', inp.attrs.get('id')) print('-----------------------------------------') tfa = soup.select("#mp-tfa") if tfa is not None: #print(tfa) paras = tfa[0].select("p") if paras is not None: #print(paras) links = paras[0].find_all("a", limit=1) if links: print(links[0].text) print(links[0].attrs.get('href'))