wwu-577/src/broken_links.py

44 lines
1.2 KiB
Python

import requests
import seaborn as sns
import matplotlib.pyplot as plt
import click
from data import connect
def _check_url(story_id, url, timeout=10):
    """Fetch one story URL and summarise the outcome as a flat dict.

    Args:
        story_id: Identifier of the story the URL belongs to; copied into
            the result unchanged.
        url: The URL to request.
        timeout: Seconds passed to ``requests.get`` as its timeout.

    Returns:
        A dict with the keys ``story_id``, ``final_url``, ``redirect``,
        ``timeout``, ``error``, ``status_code`` and ``content_length``.
        ``status_code`` stays ``None`` when no response was received, so a
        failed request cannot be mistaken for a healthy 200.
    """
    out = {
        'story_id': story_id,
        'final_url': url,
        'redirect': 0,
        'timeout': 0,
        'error': None,
        'status_code': None,
        'content_length': 0,
    }
    try:
        # NOTE(review): verify=False turns off TLS certificate checking
        # (and makes urllib3 emit InsecureRequestWarning). Kept from the
        # original; confirm this is intentional.
        response = requests.get(url, verify=False, timeout=timeout)
    except requests.exceptions.Timeout:
        # Timeout is the common base of ConnectTimeout and ReadTimeout.
        print(f"timeout: {url}")
        out['timeout'] = 1
        return out
    except requests.exceptions.RequestException as e:
        # Dead hosts, refused connections, malformed URLs, redirect loops:
        # record the failure and keep crawling instead of aborting.
        print(f"error: {url}: {e}")
        out['error'] = type(e).__name__
        return out

    # response.history holds one entry per redirect that was followed, so
    # any non-empty history means the request was redirected.
    if response.history:
        out['redirect'] = 1
    if url != response.url:
        out['final_url'] = response.url
    out['status_code'] = response.status_code
    out['content_length'] = len(response.content)
    return out


# The docstring below is kept to one line on purpose: click uses it as the
# command's --help text.
@click.command(name="broken:crawl")
def crawl():
    """crawl story urls checking for link rot or redirects."""
    DB = connect()
    try:
        urls = DB.query("""
            select
                id
                ,url
            from stories
            order by published_at asc
            limit 5
        """).fetchall()
    finally:
        # Release the connection even when the query itself fails.
        DB.close()

    responses = [_check_url(story_id, url) for story_id, url in urls]

    # NOTE(review): the original plotted `hist['cnt']`, but `hist` was never
    # defined, so the command always ended in NameError. Plotting the
    # collected status codes is an assumption about the intent -- confirm.
    status_codes = [
        r['status_code'] for r in responses if r['status_code'] is not None
    ]
    if status_codes:
        sns.histplot(x=status_codes)
        plt.show()