Some major changes and improvements

Misha 2023-08-08 11:36:48 -05:00
parent e3324bbb60
commit 4a1259599c

* Initialize
#+begin_src python
import json      # to parse data
import requests  # to get data
import os        # to build the download path
from datetime import date  # to get the current date

# Get user id
instance = "https://social.edu.nl"
username = "mishavelthuis"
id = json.loads(requests.get(f"{instance}/api/v1/accounts/lookup?acct={username}").text)['id']

# Get current date
current_date = date.today()

# Create filename for data output
download_dir = os.path.expanduser("~/Downloads")
file_name_save = f'{download_dir}/mydata_{current_date}_{username}.csv'
#+end_src
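- If the instance is unreachable or the username does not resolve, the lookup above fails with a cryptic =KeyError=. A minimal sanity check (a sketch, not part of the original setup) could be:
#+begin_src python
# Sketch: verify the account lookup before downloading anything
resp = requests.get(f"{instance}/api/v1/accounts/lookup?acct={username}")
resp.raise_for_status()  # stop early on HTTP errors (404, 5xx, ...)
account = resp.json()
print(account['id'], account['acct'])  # this id is used by the download step below
#+end_src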
* Get/refresh data
- I used [[https://jrashford.com/2023/02/13/how-to-scrape-mastodon-timelines-using-python-and-pandas/][this]] setup.
- Only has to be refreshed (run) every now and then.
#+begin_src python
import json          # to parse data
import requests      # to get data
import pandas as pd  # to work with data
from datetime import date  # to get the current date
import subprocess    # for getting access token from pass
import os            # to remove the old file

# To not append to an existing file
if os.path.exists(file_name_save):
    os.remove(file_name_save)

url = f'{instance}/api/v1/accounts/{id}/statuses'
params = {
    'limit': 40
}

num_done = 0
while True:
    print(f'{num_done} statuses downloaded')
    try:
        r = requests.get(url, params=params)
        toots = json.loads(r.text)
    except Exception as error:
        print("request didn't work:", error)
        break

    if len(toots) == 0:
        break

    try:
        # Page backwards through the timeline via max_id
        max_id = toots[-1]['id']
        params['max_id'] = max_id
    except Exception as error:
        print("An error occurred with max_id:", error)

    num_done = num_done + len(toots)

    try:
        df = pd.DataFrame(toots)
        # Write the header only for the first batch
        df.to_csv(file_name_save, mode='a', index=False, header=not os.path.exists(file_name_save))
    except Exception as error:
        print("An error occurred with df:", error)
        num_done = num_done - len(toots)
#+end_src
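- The imports mention getting an access token from pass, but the loop above makes unauthenticated requests, so it only sees public statuses. A sketch of an authenticated variant (the pass entry name is an assumption) could look like this:
#+begin_src python
# Assumption: the token is stored in pass under "mastodon/social.edu.nl"
token = subprocess.run(["pass", "show", "mastodon/social.edu.nl"],
                       capture_output=True, text=True, check=True).stdout.strip()
headers = {"Authorization": f"Bearer {token}"}

# Same statuses endpoint as above, now with authentication
r = requests.get(url, params=params, headers=headers)
r.raise_for_status()
toots = r.json()
#+end_src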
* Use/search data
- You don't have to re-download the data for every search.
#+begin_src python
import pandas as pd            # work with data
from bs4 import BeautifulSoup  # to more easily read the html output

df = pd.read_csv(file_name_save)

query = "test"

# Search for words in the status content
for index, row in df.iterrows():
    if isinstance(row['content'], str):
        if query in row['content']:
            soup = BeautifulSoup(row['content'], 'html.parser')
            readable_text = soup.get_text(separator=' ', strip=True)
            print(row['url'])
            print(row['created_at'])
            print(readable_text)
            print("----")
#+end_src
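- If only the matching rows are needed, the same search can be done with a pandas string filter; this is a sketch of an alternative, not the setup used above:
#+begin_src python
# Sketch: case-insensitive filtering with pandas instead of a Python loop
hits = df[df['content'].fillna('').str.contains(query, case=False, regex=False)]
for _, row in hits.iterrows():
    text = BeautifulSoup(row['content'], 'html.parser').get_text(separator=' ', strip=True)
    print(row['created_at'], row['url'])
    print(text)
    print("----")
#+end_src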