search-mastodon/mastodon-get-posts.org at e3324bbb608e3c6c1f1063359e057b6b16a54b97

Misha e3324bbb60 Added username to the filename

2023-08-08 01:25:26 -05:00

1.7 KiB

Raw Blame History

Import
Get/refresh data
Use/search data

Import

import inspect  # to locate the directory of the running code
import json  # to parse data
import os  # to build the output path
import subprocess  # for getting access token from pass
from datetime import date  # to get the current date

import pandas as pd  # work with data
import requests  # to get data
from bs4 import BeautifulSoup  # to more easily read the html output
from mastodon import Mastodon  # to get the user id

Get/refresh data

I used this setup.
The data only has to be refreshed (i.e. this block re-run) every now and then.

# Download all statuses of one Mastodon account and save them to a dated CSV
# file next to the running code.
instance = "https://social.edu.nl"
username = "mishavelthuis"
REQUEST_TIMEOUT = 30  # seconds; keeps a stalled connection from hanging the run

# Get user id
# The lookup is built from `instance` and `username`, so changing those two
# values above is enough to fetch another account.
lookup = requests.get(
    f"{instance}/api/v1/accounts/lookup",
    params={"acct": username},
    timeout=REQUEST_TIMEOUT,
)
lookup.raise_for_status()
# NOTE: `id` shadows the builtin; the name is kept because later code may read it.
id = lookup.json()['id']

URL = f'{instance}/api/v1/accounts/{id}/statuses'
params = {
    'limit': 40
}

results = []

# Page through the statuses: each request continues from the last status of the
# previous page, and an empty page means everything has been fetched.
while True:
    r = requests.get(URL, params=params, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error (e.g. rate limiting) instead of trying to
    # index an error object as if it were a list of statuses.
    r.raise_for_status()
    toots = r.json()

    if not toots:
        break

    results.extend(toots)

    max_id = toots[-1]['id']
    params['max_id'] = max_id

df = pd.DataFrame(results)

current_date = date.today()
# Directory of the running code. abspath() makes a relative or non-file frame
# name resolve against the working directory instead of yielding an empty
# directory (which would make the output path start at the filesystem root).
current_dir = os.path.dirname(
    os.path.abspath(inspect.getfile(inspect.currentframe()))
)
file_name_save = f'{current_dir}/mydata_{current_date}_{username}.csv'
df.to_csv(file_name_save, index=False)

Use/search data

You don't have to download all the data again for every search; the saved CSV file is read back instead.

# Read the previously saved export back in.
df = pd.read_csv(file_name_save)

query = "test"

# Search for words
# The match is done on the raw HTML of each status; only the hits are
# converted to plain text for printing.
for html in df['content']:
    # Skip cells that are not strings (e.g. empty content) and non-matches.
    if not isinstance(html, str) or query not in html:
        continue
    readable_text = BeautifulSoup(html, 'html.parser').get_text(separator=' ', strip=True)
    print(readable_text)
    print("----")

1.7 KiB Raw Blame History

Import

Get/refresh data

Use/search data

1.7 KiB

Raw Blame History