* Initialize
#+begin_src python
import json # to parse data
import requests # to get data
import os # to build the output path
from datetime import date # to get the current date

# Get the account id for a username
instance = "https://social.edu.nl"
username = "mishavelthuis"
account_id = json.loads(requests.get(f"{instance}/api/v1/accounts/lookup?acct={username}").text)['id']

# Get the current date (used in the output filename)
current_date = date.today()

# Create filename for data output
#current_dir="/".join(inspect.getfile(inspect.currentframe()).split("/")[:-1])
download_dir = os.path.expanduser("~/Downloads")
file_name_save = f'{download_dir}/mydata_{current_date}_{username}.csv'
#+end_src

* Get/refresh data
- I used [[https://jrashford.com/2023/02/13/how-to-scrape-mastodon-timelines-using-python-and-pandas/][this]] setup.
- Only needs to be re-run every now and then to refresh the data.
#+begin_src python
import json # to parse data
import requests # to get data
import pandas as pd # to work with data
import os # to remove/check the output file
import subprocess # for getting an access token from pass (not used in this block)

# Start from a clean file so we don't append to an old download
if os.path.exists(file_name_save):
    os.remove(file_name_save)

url = f'{instance}/api/v1/accounts/{account_id}/statuses'
params = {
    'limit': 40 # maximum page size for this endpoint
}

num_done = 0

while True:
    print(f'{num_done} statuses downloaded')
    try:
        r = requests.get(url, params=params)
        toots = json.loads(r.text)
    except requests.RequestException as error:
        print("request didn't work:", error)
        break

    if len(toots) == 0:
        break

    try:
        # Page backwards: ask for statuses older than the last one received
        params['max_id'] = toots[-1]['id']
    except Exception as error:
        print("An error occurred with max_id:", error)

    try:
        df = pd.DataFrame(toots)
        # Write the header only once, on the first batch
        df.to_csv(file_name_save, mode='a', index=False, header=not os.path.exists(file_name_save))
        num_done = num_done + len(toots)
    except Exception as error:
        print("An error occurred with df:", error)
#+end_src

* Use/search data
- The search runs against the saved CSV, so you don't have to re-download the data for every search.
#+begin_src python
import pandas as pd # to work with data
from bs4 import BeautifulSoup # to turn the HTML content into readable text

df = pd.read_csv(file_name_save)

query = "test"

# Search for words in the post content
for _, row in df.iterrows():
    if isinstance(row['content'], str) and query in row['content']:
        soup = BeautifulSoup(row['content'], 'html.parser')
        readable_text = soup.get_text(separator=' ', strip=True)
        print(row['url'])
        print(row['created_at'])
        print(readable_text)
        print("----")
#+end_src
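- The loop above is a case-sensitive substring match. As a variant, pandas string matching can do a case-insensitive search; this is a minimal sketch, not part of the original setup, and it assumes the same CSV and column names as above.
#+begin_src python
import pandas as pd # to work with data
from bs4 import BeautifulSoup # to turn the HTML content into readable text

df = pd.read_csv(file_name_save)
query = "test"

# Case-insensitive plain-text match on the raw HTML content; na=False skips empty rows
matches = df[df['content'].str.contains(query, case=False, na=False, regex=False)]

for _, row in matches.iterrows():
    readable_text = BeautifulSoup(row['content'], 'html.parser').get_text(separator=' ', strip=True)
    print(row['url'])
    print(row['created_at'])
    print(readable_text)
    print("----")
#+end_src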
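* Optional: authenticated requests
- The subprocess import in the download block mentions getting an access token from pass, but the token is never used there. Below is a minimal sketch of one way to do that; the password-store entry name ("mastodon/token") is a made-up placeholder, and the token is sent as a standard Bearer header so the request is made on behalf of your own account.
#+begin_src python
import subprocess # to read the access token from the pass password store
import requests # to get data

# "mastodon/token" is a hypothetical entry name; adjust to your own password store
token = subprocess.run(["pass", "show", "mastodon/token"],
                       capture_output=True, text=True, check=True).stdout.strip()
headers = {"Authorization": f"Bearer {token}"}

# Same statuses endpoint as in the download block, now with the Authorization header
r = requests.get(f"{instance}/api/v1/accounts/{account_id}/statuses",
                 params={'limit': 40}, headers=headers)
print(r.status_code, len(r.json()), "statuses in the first page")
#+end_src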