44 lines
1.2 KiB
Python
44 lines
1.2 KiB
Python
import requests
|
|
import seaborn as sns
|
|
import matplotlib.pyplot as plt
|
|
import click
|
|
|
|
from data import connect
|
|
|
|
def _probe_url(story_id, url):
    """Fetch one story URL and summarize the outcome as a flat record.

    Returns a dict with the keys ``story_id``, ``final_url``, ``redirect``,
    ``timeout``, ``status_code`` and ``content_length``. Every key is always
    present so the records can be tabulated directly.

    ``status_code`` stays ``None`` when no HTTP response was received at all.
    In that case ``timeout`` is 1 for a timeout and 0 for any other request
    failure (DNS error, refused connection, invalid URL, ...).
    """
    out = {
        'story_id': story_id,
        'final_url': url,
        'redirect': 0,
        'timeout': 0,
        # None rather than 200: a URL that never answered must not be
        # reported as healthy.
        'status_code': None,
        'content_length': 0,
    }

    try:
        # NOTE(review): verify=False disables TLS certificate checks
        # (presumably so sites with expired or self-signed certificates can
        # still be probed) -- confirm this is intended.
        response = requests.get(url, verify=False, timeout=10)
    except requests.exceptions.Timeout:
        # Timeout is the common base of ConnectTimeout and ReadTimeout.
        print(f"timeout: {url}")
        out['timeout'] = 1
        return out
    except requests.exceptions.RequestException as e:
        # A dead host is a link-rot result, not a reason to abort the crawl.
        print(f"error: {url}: {e}")
        return out

    # requests adds one entry to `history` per redirect hop, so a single
    # redirect already yields a non-empty list.
    if response.history:
        out['redirect'] = 1
    if url != response.url:
        out['final_url'] = response.url
    out['status_code'] = response.status_code
    out['content_length'] = len(response.content)
    return out


@click.command(name="broken:crawl")
def crawl():
    """crawl story urls checking for link rot or redirects.

    Reads the five oldest stories (by ``published_at``) from the database,
    requests each URL, and shows a histogram of the HTTP status codes that
    came back.
    """
    DB = connect()
    try:
        urls = DB.query("""
            select
                id
                ,url
            from stories
            order by published_at asc
            limit 5
        """).fetchall()
    finally:
        # Release the connection even if the query fails.
        DB.close()

    responses = [_probe_url(story_id, url) for story_id, url in urls]

    # NOTE(review): the original plotted `hist['cnt']`, but `hist` was never
    # defined. Status codes are plotted instead as the most direct link-rot
    # signal -- confirm this is the intended metric.
    status_codes = [
        record['status_code']
        for record in responses
        if record['status_code'] is not None
    ]
    if status_codes:
        sns.histplot(x=status_codes)
        plt.show()
|