* Initialize
- Run this part first; both of the following sections depend on it.
#+begin_src python
import json # to parse data
import requests # to get data
from datetime import date # to get the current date
import os # to build the output path
# Look up the numeric account id (named account_id to avoid shadowing Python's built-in id)
instance = "https://social.edu.nl"
username = "mishavelthuis"
account_id = json.loads(requests.get(f"{instance}/api/v1/accounts/lookup?acct={username}").text)['id']

# Build a dated filename for the CSV output
current_date = date.today()
download_dir = os.path.expanduser("~/Downloads")
file_name_save = f'{download_dir}/mydata_{current_date}_{username}.csv'
#+end_src
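- If the account lookup fails (a typo in the username, or an unreachable instance), the ['id'] access above raises an error. A minimal guarded variant of the same lookup, reusing the instance and username variables from above:
#+begin_src python
import requests

# Check the HTTP status before indexing into the response
r = requests.get(f"{instance}/api/v1/accounts/lookup?acct={username}", timeout=30)
if r.status_code == 200:
    account_id = r.json()['id']
    print(f"Found account id {account_id} for {username}")
else:
    raise SystemExit(f"Lookup failed with HTTP {r.status_code}: {r.text}")
#+end_src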
* Get/refresh data
- I used the setup from [[https://jrashford.com/2023/02/13/how-to-scrape-mastodon-timelines-using-python-and-pandas/][this blog post]].
- The results are saved to a CSV file, so you don't have to re-download all messages for every text search; you only need to refresh the data now and then (see the incremental-refresh sketch after the code block).
#+begin_src python
import json # to parse data
import requests # to get data
import pandas as pd # to work with data
import os # to remove the old file

# Start with a fresh file
if os.path.exists(file_name_save):
    os.remove(file_name_save)

url = f'{instance}/api/v1/accounts/{account_id}/statuses'
params = {
    'limit': 40 # maximum page size for this endpoint
}
num_done = 0

while True:
    print(f'{num_done} statuses downloaded')
    try:
        r = requests.get(url, params=params)
        toots = json.loads(r.text)
    except (requests.RequestException, ValueError) as error:
        print("Request didn't work:", error)
        break

    if len(toots) == 0:
        break

    # Page backwards: request statuses older than the last one received
    try:
        params['max_id'] = toots[-1]['id']
    except (IndexError, KeyError, TypeError) as error:
        print("An error occurred with max_id:", error)
        break

    try:
        df = pd.DataFrame(toots)
        # Append, writing the header only when the file doesn't exist yet
        df.to_csv(file_name_save, mode='a', index=False,
                  header=not os.path.exists(file_name_save))
        num_done += len(toots)
    except Exception as error:
        print("An error occurred with df:", error)
#+end_src
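- To refresh without re-downloading everything, the same statuses endpoint also accepts a since_id parameter that returns only posts newer than a given id. A minimal sketch, assuming the CSV from the block above already exists and its id column was parsed as integers; for more than one page of new posts you would loop as above:
#+begin_src python
import requests
import pandas as pd

# The newest status id we already have on disk
existing = pd.read_csv(file_name_save)
newest_id = existing['id'].max()

# Fetch a single page of up to 40 statuses posted after that id
r = requests.get(f'{instance}/api/v1/accounts/{account_id}/statuses',
                 params={'limit': 40, 'since_id': newest_id})
new_toots = r.json()
if new_toots:
    # Append without a header; assumes the column order matches the existing file
    pd.DataFrame(new_toots).to_csv(file_name_save, mode='a', index=False, header=False)
print(f'{len(new_toots)} new statuses appended')
#+end_src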
* Use/search data
- You can search posts using the CSV file saved in the previous section.
#+begin_src python
import pandas as pd # work with data
from bs4 import BeautifulSoup # to more easily read the html output

df = pd.read_csv(file_name_save)
query = "test"

# Search for the query in the raw HTML of each post
for index, row in df.iterrows():
    if isinstance(row['content'], str) and query in row['content']:
        soup = BeautifulSoup(row['content'], 'html.parser')
        readable_text = soup.get_text(separator=' ', strip=True)
        print(row['url'])
        print(row['created_at'])
        print(readable_text)
        print("----")
#+end_src
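- The loop above matches the query against the raw HTML and is case-sensitive, so a word split by markup or differing in case is missed. A sketch of a case-insensitive variant using pandas' standard str.contains filter, with the same df and query as above:
#+begin_src python
import pandas as pd # work with data
from bs4 import BeautifulSoup # to strip the html

# Case-insensitive plain-substring match; NaN contents count as no match
hits = df[df['content'].str.contains(query, case=False, na=False, regex=False)]
for index, row in hits.iterrows():
    readable_text = BeautifulSoup(row['content'], 'html.parser').get_text(separator=' ', strip=True)
    print(row['url'])
    print(row['created_at'])
    print(readable_text)
    print("----")
#+end_src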