Exercise: Fetch URLs in parallel
- top-websites
- Given a file with a list of URLs, collect the title of each site.
examples/parallel/urls.txt
https://google.com/
https://youtube.com/
https://facebook.com/
https://baidu.com/
https://twitter.com/
https://instagram.com/
https://en.wikipedia.org/
https://www.amazon.com/
https://yahoo.com/
https://yandex.ru/
https://vk.com/
https://live.com/
https://naver.com/
https://yahoo.co.jp/
https://google.com.br/
https://netflix.com/
https://reddit.com/
https://ok.ru/
https://mail.ru/
https://ebay.com/
https://linkedin.com/
https://qq.com/
https://pinterest.com/
https://bing.com/
https://whatsapp.com/
https://office.com/
https://amazon.de/
https://aliexpress.com/
https://amazon.co.jp/
https://msn.com/
https://google.de/
https://paypal.com/
https://rakuten.co.jp/
https://amazon.co.uk/
https://daum.net/
https://google.co.jp/
https://imdb.com/
https://booking.com/
https://roblox.com/
https://9apps.com/
https://globo.com/
https://duckduckgo.com/
https://www.nttdocomo.co.jp/
examples/parallel/fetch_urls.py
import sys
import time

import requests
from bs4 import BeautifulSoup


def get_urls(limit):
    # Read the URLs, one per line, and keep at most `limit` of them.
    with open('urls.txt') as fh:
        urls = [line.rstrip("\n") for line in fh]
    if len(urls) > limit:
        urls = urls[:limit]
    return urls


def get_title(url):
    # Return a (title, error) pair; exactly one of the two is None.
    try:
        resp = requests.get(url)
        if resp.status_code != 200:
            return None, f"Incorrect status_code {resp.status_code} for {url}"
    except Exception as err:
        return None, f"Error: {err} for {url}"

    soup = BeautifulSoup(resp.content, 'html.parser')
    if soup.title is None:
        return None, f"No title found for {url}"
    return soup.title.string, None


def main():
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} LIMIT")
    limit = int(sys.argv[1])

    urls = get_urls(limit)
    print(urls)

    start = time.time()
    titles = []
    for url in urls:
        # print(f"Processing {url}")
        title, err = get_title(url)
        if err:
            print(err)
        else:
            print(title)
        titles.append({
            "url": url,
            "title": title,
            "err": err,
        })
    end = time.time()

    print(f"Elapsed time: {end-start} for {len(urls)} pages.")
    print(titles)


if __name__ == '__main__':
    main()
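The script above fetches the pages one after the other; the exercise is to do the same work in parallel. The following is a minimal sketch of one possible approach using ThreadPoolExecutor from the standard library's concurrent.futures module. It assumes the sequential script is importable as fetch_urls (so get_urls and get_title can be reused) and that it runs in the same directory as urls.txt; the worker count of 10 is an arbitrary choice, not part of the original example.

import sys
import time
from concurrent.futures import ThreadPoolExecutor

# Reuse the helpers from the sequential example above (fetch_urls.py).
from fetch_urls import get_urls, get_title


def main():
    if len(sys.argv) < 2:
        sys.exit(f"Usage: {sys.argv[0]} LIMIT")
    urls = get_urls(int(sys.argv[1]))

    start = time.time()
    # Each URL is fetched in its own worker thread; map() returns the
    # results in the same order as the input URLs.
    with ThreadPoolExecutor(max_workers=10) as executor:  # worker count chosen arbitrarily
        results = list(executor.map(get_title, urls))
    end = time.time()

    titles = [
        {"url": url, "title": title, "err": err}
        for url, (title, err) in zip(urls, results)
    ]
    print(f"Elapsed time: {end-start} for {len(urls)} pages.")
    print(titles)


if __name__ == '__main__':
    main()

Because the fetches overlap, the elapsed time is dominated by the slowest responses rather than by the sum of all response times, which is what the sequential version measures.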