Compare commits


2 Commits

Author  SHA1        Message              Date
Misha   49d1102a45  Added some comments  2023-08-08 01:02:02 -05:00
Misha   929639a394  Small adjustments    2023-08-08 01:01:30 -05:00
2 changed files with 73 additions and 3 deletions

mastodon-get-posts.org

@@ -1,7 +1,7 @@
 * Import
 #+begin_src python
-import json
-import requests
+import json # to parse data
+import requests # to get data
 import pandas as pd
 from mastodon import Mastodon # to get the user id
 from datetime import date # to get the current date
@@ -10,6 +10,7 @@ from bs4 import BeautifulSoup # to more easily read the html output
 #+end_src
 * Get/refresh data
 - I used [[https://jrashford.com/2023/02/13/how-to-scrape-mastodon-timelines-using-python-and-pandas/][this]] setup.
+- Only has to be refreshed (run) every now and then
 #+begin_src python
 # Get access token
 personal_access_token = subprocess.check_output(["pass", "mastodon/access_token"]).strip().decode("utf-8")
@@ -51,7 +52,7 @@ df.to_csv(file_name_save, index=False)
 #+end_src
 * Use/search data
 Use existing data multiple times.
+- You don't have to re-fetch all the data for every search.
 #+begin_src python
 df = pd.read_csv(file_name_save)

mastodon-get-posts.org~ (new file, 69 lines)

@@ -0,0 +1,69 @@
* Import
#+begin_src python
import json
import requests
import pandas as pd
from mastodon import Mastodon # to get the user id
from datetime import date # to get the current date
import subprocess # for getting access token from pass
import inspect # to find this file's directory when saving the CSV
from bs4 import BeautifulSoup # to more easily read the html output
#+end_src
* Get/refresh data
- I used [[https://jrashford.com/2023/02/13/how-to-scrape-mastodon-timelines-using-python-and-pandas/][this]] setup.
#+begin_src python
# Get access token from the pass password manager
personal_access_token = subprocess.check_output(["pass", "mastodon/access_token"]).strip().decode("utf-8")
# Set up access
instance = "https://social.edu.nl"
mastodon = Mastodon(api_base_url=instance, access_token=personal_access_token)
# Get user's info
me = mastodon.me()
my_id = me["id"]
URL = f'{instance}/api/v1/accounts/{my_id}/statuses'
params = {
    'limit': 40
}
results = []
# Page through the account's statuses until none are left
while True:
    r = requests.get(URL, params=params)
    toots = json.loads(r.text)
    if len(toots) == 0:
        break
    results.extend(toots)
    # Ask the next request for toots older than the last one received
    max_id = toots[-1]['id']
    params['max_id'] = max_id
df = pd.DataFrame(results)
current_date = date.today()
# Save as a dated CSV next to this file
current_dir = "/".join(inspect.getfile(inspect.currentframe()).split("/")[:-1])
file_name_save = f'{current_dir}/mydata_{current_date}.csv'
df.to_csv(file_name_save, index=False)
#+end_src
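
Note that file_name_save embeds the current date, so it only exists in the session that just created it. A minimal sketch (the glob lookup is an illustrative addition, assuming the mydata_*.csv naming above) for finding the most recent export in a later session:
#+begin_src python
import glob
import os

# Illustrative addition, not in the original notes:
# pick the newest dated CSV in this file's directory
candidates = glob.glob(f'{current_dir}/mydata_*.csv')
file_name_save = max(candidates, key=os.path.getmtime)
#+end_src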
* Use/search data
Use existing data multiple times.
#+begin_src python
df = pd.read_csv(file_name_save)
query = "test"
# Search for the query in each toot's HTML content
for i in df['content']:
    if isinstance(i, str):
        if query in i:
            # Strip the HTML tags for readable output
            soup = BeautifulSoup(i, 'html.parser')
            readable_text = soup.get_text(separator=' ', strip=True)
            print(readable_text)
            print("----")
#+end_src
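
Since the point is to reuse the saved data for many searches, here is a minimal sketch (the search_toots function is an illustrative addition, not part of the original notes) that loads the CSV once and can then be queried repeatedly:
#+begin_src python
def search_toots(df, query):
    # Return the plain-text form of every toot that contains the query
    hits = []
    for content in df['content']:
        if isinstance(content, str) and query in content:
            soup = BeautifulSoup(content, 'html.parser')
            hits.append(soup.get_text(separator=' ', strip=True))
    return hits

# Load once, search as often as needed
df = pd.read_csv(file_name_save)
for text in search_toots(df, "test"):
    print(text)
    print("----")
#+end_src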