From 4a1259599c0d08d913fa1a9b6cc82415cd52ce63 Mon Sep 17 00:00:00 2001
From: Misha
Date: Tue, 8 Aug 2023 11:36:48 -0500
Subject: [PATCH] Some major changes and improvements

---
 mastodon-get-posts.org | 88 ++++++++++++++++++++++++++++--------------
 1 file changed, 59 insertions(+), 29 deletions(-)

diff --git a/mastodon-get-posts.org b/mastodon-get-posts.org
index 56456e8..615ed57 100644
--- a/mastodon-get-posts.org
+++ b/mastodon-get-posts.org
@@ -1,63 +1,93 @@
-* Import
+* Initialize
 #+begin_src python
 import json # to parse data
 import requests # to get data
-import pandas as pd # work with data
-from mastodon import Mastodon # to get the user id
-from datetime import date # to get the current date
-import subprocess # for getting access token from pass
-from bs4 import BeautifulSoup # to more easily read the html output
-#+end_src
+from datetime import date # to get the current date
+import os # to build the output path
+
+# get user id
+instance = "https://social.edu.nl"
+username = "mishavelthuis"
+id = json.loads(requests.get(f"{instance}/api/v1/accounts/lookup?acct={username}").text)['id']
+
+# get current date
+current_date = date.today()
+
+# Create filename for data output
+#current_dir="/".join(inspect.getfile(inspect.currentframe()).split("/")[:-1])
+download_dir=os.path.expanduser("~/Downloads")
+file_name_save=f'{download_dir}/mydata_{current_date}_{username}.csv'
+#+end_src
 * Get/refresh data
 - I used [[https://jrashford.com/2023/02/13/how-to-scrape-mastodon-timelines-using-python-and-pandas/][this]] setup.
 - Only have to be refreshed (run) every now and then
 #+begin_src python
-# Get user id
-instance = "https://social.edu.nl"
-username = "mishavelthuis"
-id = json.loads(requests.get("https://social.edu.nl/api/v1/accounts/lookup?acct=mishavelthuis").text)['id']
+import json # to parse data
+import requests # to get data
+import pandas as pd # work with data
+from datetime import date # to get the current date
+import subprocess # for getting access token from pass
+import os # to remove file
 
-URL = f'{instance}/api/v1/accounts/{id}/statuses'
+# To not append to an existing file from an earlier run
+if os.path.exists(file_name_save):
+    os.remove(file_name_save)
+
+url = f'{instance}/api/v1/accounts/{id}/statuses'
 params = {
     'limit': 40
 }
 
 results = []
+num_done = 0
 while True:
-    r = requests.get(URL, params=params)
-    toots = json.loads(r.text)
+    print(f'{num_done} statuses downloaded')
+    try:
+        r = requests.get(url, params=params)
+        toots = json.loads(r.text)
+    except Exception as error:
+        print("request didn't work:", error)
+        break
 
     if len(toots) == 0:
         break
 
-    results.extend(toots)
-
-    max_id = toots[-1]['id']
-    params['max_id'] = max_id
-
-df = pd.DataFrame(results)
+    # Paginate: ask the next request for statuses older than the last one received
+    try:
+        max_id = toots[-1]['id']
+        params['max_id'] = max_id
+    except Exception as error:
+        print("An error occurred with max_id:", error)
 
-current_date = date.today()
-current_dir="/".join(inspect.getfile(inspect.currentframe()).split("/")[:-1])
-file_name_save=f'{current_dir}/mydata_{current_date}_{username}.csv'
-df.to_csv(file_name_save, index=False)
+    num_done = num_done + len(toots)
+    try:
+        # Append this page to the CSV; write the header only when the file is first created
+        df = pd.DataFrame(toots)
+        df.to_csv(file_name_save, mode='a', index=False, header=not os.path.exists(file_name_save))
+    except Exception as error:
+        print("An error occurred with df:", error)
+        num_done = num_done - len(toots)
 #+end_src
-
 * Use/search data
 - You don't have to load all data for every search.
 #+begin_src python
+import pandas as pd # work with data
+from bs4 import BeautifulSoup # to more easily read the html output
+
 df=pd.read_csv(file_name_save)
 
 query="test"
 
 # Search for words
-for i in df['content']:
-    if isinstance(i,str):
-        if query in i:
-            soup = BeautifulSoup(i, 'html.parser')
+for _, row in df.iterrows():
+    if isinstance(row['content'], str):
+        if query in row['content']:
+            soup = BeautifulSoup(row['content'], 'html.parser')
             readable_text = soup.get_text(separator=' ', strip=True)
+            print(row['url'])
+            print(row['created_at'])
             print(readable_text)
             print("----")
 #+end_src
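
Not part of the patch above, but a small usage sketch for the saved CSV: pandas can also do the substring match in one vectorized step with Series.str.contains. The path, date, and username below are illustrative placeholders (the real file name is built in the Initialize block), and the content, created_at, and url columns are assumed to be present because the download script writes them.

#+begin_src python
import os
import pandas as pd

# Illustrative path only; in the patched file this is built in the Initialize block
file_name_save = os.path.expanduser("~/Downloads/mydata_2023-08-08_mishavelthuis.csv")

df = pd.read_csv(file_name_save)
query = "test"

# Case-insensitive match on the raw HTML content; rows with missing content are skipped
hits = df[df['content'].str.contains(query, case=False, na=False)]
print(hits[['created_at', 'url']])
#+end_src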