* Import
#+begin_src python
import json # to parse data
import requests # to get data
import pandas as pd
from mastodon import Mastodon # to get the user id
from datetime import date # to get the current date
import subprocess # for getting the access token from pass
import inspect # to locate this file's directory (used when saving)
from bs4 import BeautifulSoup # to more easily read the html output
#+end_src

* Get/refresh data
- I used [[https://jrashford.com/2023/02/13/how-to-scrape-mastodon-timelines-using-python-and-pandas/][this]] setup.
- Only has to be refreshed (run) every now and then.

#+begin_src python
# Get the access token from pass
personal_access_token = subprocess.check_output(
    ["pass", "mastodon/access_token"]).strip().decode("utf-8")

# Set up access
instance = "https://social.edu.nl"
mastodon = Mastodon(api_base_url=instance, access_token=personal_access_token)

# Get the user's info
me = mastodon.me()
my_id = me["id"]

URL = f'{instance}/api/v1/accounts/{my_id}/statuses'
params = {'limit': 40}

# Page backwards through the account's statuses until none are left
results = []
while True:
    r = requests.get(URL, params=params)
    toots = json.loads(r.text)

    if len(toots) == 0:
        break

    results.extend(toots)

    # Continue from the oldest toot fetched so far
    params['max_id'] = toots[-1]['id']

df = pd.DataFrame(results)

# Save the data next to this file, tagged with today's date
current_date = date.today()
current_dir = "/".join(inspect.getfile(inspect.currentframe()).split("/")[:-1])
file_name_save = f'{current_dir}/mydata_{current_date}.csv'
df.to_csv(file_name_save, index=False)
#+end_src

* Use/search data
- Searches run against the saved CSV, so you don't have to re-fetch the data for every search.

#+begin_src python
df = pd.read_csv(file_name_save)
query = "test"

# Search the toots' HTML content for the query and print it as readable text
for i in df['content']:
    if isinstance(i, str) and query in i:
        soup = BeautifulSoup(i, 'html.parser')
        readable_text = soup.get_text(separator=' ', strip=True)
        print(readable_text)
        print("----")
#+end_src
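
- The search block above assumes file_name_save is still set from the refresh run. A minimal sketch for loading the most recent saved CSV in a fresh session (assuming the CSVs sit next to this file and follow the mydata_<date>.csv naming used by the refresh step):

#+begin_src python
import glob
import os
import inspect
import pandas as pd

# Assumed location and naming: CSVs saved next to this file as
# mydata_<date>.csv, as in the refresh step above.
current_dir = "/".join(inspect.getfile(inspect.currentframe()).split("/")[:-1])

# Pick the most recently written file matching the pattern
file_name_save = max(glob.glob(f'{current_dir}/mydata_*.csv'), key=os.path.getmtime)
df = pd.read_csv(file_name_save)
#+end_src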