70 lines
1.7 KiB
Org Mode
70 lines
1.7 KiB
Org Mode
|
* Import
|
||
|
#+begin_src python
|
||
|
import json
|
||
|
import requests
|
||
|
import pandas as pd
|
||
|
from mastodon import Mastodon # to get the user id
|
||
|
from datetime import date # to get the current date
|
||
|
import subprocess # for getting access token from pass
|
||
|
from bs4 import BeautifulSoup # to more easily read the html output
|
||
|
#+end_src
|
||
|
* Get/refresh data
|
||
|
- The download loop below follows [[https://jrashford.com/2023/02/13/how-to-scrape-mastodon-timelines-using-python-and-pandas/][this tutorial on scraping Mastodon timelines with Python and pandas]].
|
||
|
#+begin_src python
|
||
|
# Download every status of the authenticated account from the Mastodon
# instance and save them as a dated CSV file next to this script.

# `inspect` and `Path` are needed to locate the output directory and are not
# imported by the import block, so bring them into scope here.
import inspect
from pathlib import Path

# Get access token from the `pass` password store
personal_access_token = (
    subprocess.check_output(["pass", 'mastodon/access_token'])
    .strip()
    .decode('utf-8')
)

# Set up access
instance = "https://social.edu.nl"
mastodon = Mastodon(api_base_url=instance, access_token=personal_access_token)

# Get user's info; the account id is needed to build the statuses URL
me = mastodon.me()
my_id = me["id"]

URL = f'{instance}/api/v1/accounts/{my_id}/statuses'
params = {
    'limit': 40,
}

results = []

# Page backwards through the account's statuses: each request asks for the
# statuses older than the last one received, until an empty page comes back.
# NOTE(review): the request is sent without the access token, so presumably
# only publicly visible statuses are returned -- confirm this is intended.
while True:
    r = requests.get(URL, params=params, timeout=30)
    # Fail loudly on an HTTP error instead of indexing into an error payload.
    r.raise_for_status()
    toots = r.json()

    if not toots:
        break

    results.extend(toots)

    max_id = toots[-1]['id']
    params['max_id'] = max_id

df = pd.DataFrame(results)

current_date = date.today()
# Directory of the file being executed. `Path.parent` yields "." when the
# frame's file name has no directory part, so the CSV then lands in the
# working directory instead of the filesystem root.
current_dir = str(Path(inspect.getfile(inspect.currentframe())).parent)
file_name_save = str(Path(current_dir) / f'mydata_{current_date}.csv')
df.to_csv(file_name_save, index=False)
|
||
|
|
||
|
#+end_src
|
||
|
|
||
|
* Use/search data
|
||
|
Reuse the CSV saved above for repeated searches, without downloading the data again.
|
||
|
#+begin_src python
|
||
|
# Load the saved statuses and print every one whose readable text contains
# the query string.
df = pd.read_csv(file_name_save)

query = "test"

# Search for words
for raw_html in df['content']:
    # Skip non-string cells (e.g. a missing content value read back as NaN).
    if not isinstance(raw_html, str):
        continue

    # Strip the HTML first and match against the readable text. Matching the
    # raw markup misses text containing HTML entities (such as "&amp;") or
    # split across tags, and gives false hits on tag and attribute names.
    soup = BeautifulSoup(raw_html, 'html.parser')
    readable_text = soup.get_text(separator=' ', strip=True)

    if query in readable_text:
        print(readable_text)
        print("----")
|
||
|
#+end_src
|
||
|
|