search-mastodon/mastodon-get-posts.org


Initialize

import json  # to parse data
import requests  # to get data
import os  # to expand the download path
from datetime import date  # to get the current date

# get user id
instance = "https://social.edu.nl"
username = "mishavelthuis"
user_id = json.loads(requests.get(f"{instance}/api/v1/accounts/lookup?acct={username}").text)['id']

# get current date
current_date = date.today()

# Create filename for data output
#current_dir="/".join(inspect.getfile(inspect.currentframe()).split("/")[:-1])
download_dir = os.path.expanduser("~/Downloads")
file_name_save = f'{download_dir}/mydata_{current_date}_{username}.csv'
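
If the lookup fails (a typo in the username, or the instance being unreachable), the one-liner above dies with an unhelpful KeyError. A slightly more defensive sketch of the same lookup (the explicit error handling is my addition, not part of the original setup):

resp = requests.get(f"{instance}/api/v1/accounts/lookup?acct={username}")
resp.raise_for_status()  # stop early on HTTP errors (404, 5xx, ...)
account = resp.json()
if 'id' not in account:
    raise SystemExit(f"Account lookup failed for {username}: {account}")
user_id = account['id']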

Get/refresh data

  • This is the setup I used.
  • It only has to be refreshed (run) every now and then.
import json  # to parse data
import requests # to get data
import pandas as pd # work with data
from datetime import date # to get the current date
import subprocess # for getting access token from pass
import os # to remove file

# Remove any previous output so we don't append to an existing file
if os.path.exists(file_name_save):
    os.remove(file_name_save)

url = f'{instance}/api/v1/accounts/{user_id}/statuses'
params = {
    'limit': 40
}

results = []
num_done = 0

while True:
    print(f'{num_done} statuses downloaded')
    try:
        r = requests.get(url, params=params)
        toots = json.loads(r.text)
    except Exception as error:
        print("request didn't work:", error)
        break

    if len(toots) == 0:
        break

    try:
        max_id = toots[-1]['id']
        params['max_id'] = max_id
    except Exception as error:
        print("An error occurred with max_id:", error)
        break

    try:
        df = pd.DataFrame(toots)
        # write the header only once, on the first chunk
        df.to_csv(file_name_save, mode='a', index=False,
                  header=not os.path.exists(file_name_save))
        num_done += len(toots)  # the last page may hold fewer than 40
    except Exception as error:
        print("An error occurred with df:", error)
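
The subprocess import above is meant for fetching an access token from pass. Public posts come back without authentication, but an Authorization header is needed for the API to include private posts. A minimal sketch, assuming the token is stored under a hypothetical pass entry named mastodon/token (both the entry name and the header wiring are my assumptions):

import subprocess

# read the token from the password store; a hypothetical entry name
token = subprocess.run(["pass", "mastodon/token"],
                       capture_output=True, text=True,
                       check=True).stdout.strip()
headers = {"Authorization": f"Bearer {token}"}

# then fetch with the header, e.g.:
# r = requests.get(url, params=params, headers=headers)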

Use/search data

  • You don't have to reload the data for every search; read the saved CSV once and query it repeatedly.
import pandas as pd # work with data
from bs4 import BeautifulSoup # to more easily read the html output

df = pd.read_csv(file_name_save)

query = "test"

# Search for words
for index, row in df.iterrows():
    if isinstance(row['content'], str) and query in row['content']:
        soup = BeautifulSoup(row['content'], 'html.parser')
        readable_text = soup.get_text(separator=' ', strip=True)
        print(row['url'])
        print(row['created_at'])
        print(readable_text)
        print("----")