Some major changes and improvements
This commit is contained in:
parent
e3324bbb60
commit
4a1259599c
|
@ -1,63 +1,87 @@
|
||||||
* Import
|
* Initialize
|
||||||
#+begin_src python
|
#+begin_src python
|
||||||
import json # to parse data
|
import json # to parse data
|
||||||
import requests # to get data
|
import requests # to get data
|
||||||
import pandas as pd # work with data
|
|
||||||
from mastodon import Mastodon # to get the user id
|
# get user id
|
||||||
from datetime import date # to get the current date
|
instance = "https://social.edu.nl"
|
||||||
import subprocess # for getting access token from pass
|
username = "mishavelthuis"
|
||||||
from bs4 import BeautifulSoup # to more easily read the html output
|
id = json.loads(requests.get(f"{instance}/api/v1/accounts/lookup?acct={username}").text)['id']
|
||||||
#+end_src
|
|
||||||
|
# get current date
|
||||||
|
current_date = date.today()
|
||||||
|
|
||||||
|
# Create filename for data output
|
||||||
|
#current_dir="/".join(inspect.getfile(inspect.currentframe()).split("/")[:-1])
|
||||||
|
download_dir=os.path.expanduser("~/Downloads")
|
||||||
|
file_name_save=f'{download_dir}/mydata_{current_date}_{username}.csv'
|
||||||
|
#+end_src
|
||||||
* Get/refresh data
|
* Get/refresh data
|
||||||
- I used [[https://jrashford.com/2023/02/13/how-to-scrape-mastodon-timelines-using-python-and-pandas/][this]] setup.
|
- I used [[https://jrashford.com/2023/02/13/how-to-scrape-mastodon-timelines-using-python-and-pandas/][this]] setup.
|
||||||
- The data only has to be refreshed (i.e. this block re-run) every now and then.
|
- The data only has to be refreshed (i.e. this block re-run) every now and then.
|
||||||
#+begin_src python
|
#+begin_src python
|
||||||
# Get user id
|
import json # to parse data
|
||||||
instance = "https://social.edu.nl"
|
import requests # to get data
|
||||||
username = "mishavelthuis"
|
import pandas as pd # work with data
|
||||||
id = json.loads(requests.get("https://social.edu.nl/api/v1/accounts/lookup?acct=mishavelthuis").text)['id']
|
from datetime import date # to get the current date
|
||||||
|
import subprocess # for getting access token from pass
|
||||||
|
import os # to remove file
|
||||||
|
|
||||||
URL = f'{instance}/api/v1/accounts/{id}/statuses'
|
# Remove any existing output file so new data is not appended to old data
|
||||||
|
# The file name contains today's date, so the file may not exist yet
# (e.g. on the first run of the day); only remove it when it is there.
if os.path.exists(file_name_save):
    os.remove(file_name_save)
|
||||||
|
|
||||||
|
url = f'{instance}/api/v1/accounts/{id}/statuses'
|
||||||
params = {
|
params = {
|
||||||
'limit': 40
|
'limit': 40
|
||||||
}
|
}
|
||||||
|
|
||||||
results = []
|
results = []
|
||||||
|
num_done = 0
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
r = requests.get(URL, params=params)
|
print(f'{num_done} statuses downloaded')
|
||||||
toots = json.loads(r.text)
|
try:
|
||||||
|
r = requests.get(url, params=params)
|
||||||
|
toots = json.loads(r.text)
|
||||||
|
except:
|
||||||
|
print("request didn't work")
|
||||||
|
|
||||||
if len(toots) == 0:
|
if len(toots) == 0:
|
||||||
break
|
break
|
||||||
|
|
||||||
results.extend(toots)
|
try:
|
||||||
|
max_id = toots[-1]['id']
|
||||||
max_id = toots[-1]['id']
|
params['max_id'] = max_id
|
||||||
params['max_id'] = max_id
|
except Exception as error:
|
||||||
|
print("An error occurred with max_id:", error)
|
||||||
df = pd.DataFrame(results)
|
|
||||||
|
|
||||||
current_date = date.today()
|
num_done=num_done+40
|
||||||
current_dir="/".join(inspect.getfile(inspect.currentframe()).split("/")[:-1])
|
|
||||||
file_name_save=f'{current_dir}/mydata_{current_date}_{username}.csv'
|
|
||||||
df.to_csv(file_name_save, index=False)
|
|
||||||
|
|
||||||
|
try:
|
||||||
|
df = pd.DataFrame(toots)
|
||||||
|
# Pages are appended one by one; write the column header only when the file
# does not exist yet, so it is not repeated as a data row for every page.
df.to_csv(file_name_save, mode='a', index=False, header=not os.path.exists(file_name_save))
|
||||||
|
except Exception as error:
|
||||||
|
print("An error occurred with df:", error)
|
||||||
|
num_done=num_done-40
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
||||||
* Use/search data
|
* Use/search data
|
||||||
- You don't have to load all data for every search.
|
- You don't have to load all data for every search.
|
||||||
#+begin_src python
|
#+begin_src python
|
||||||
|
import pandas as pd # work with data
|
||||||
|
from bs4 import BeautifulSoup # to more easily read the html output
|
||||||
|
|
||||||
df=pd.read_csv(file_name_save)
|
df=pd.read_csv(file_name_save)
|
||||||
|
|
||||||
query="test"
|
query="test"
|
||||||
|
|
||||||
# Search for words
|
# Search for words
|
||||||
for i in df['content']:
|
for index, i in df.iterrows():
|
||||||
if isinstance(i,str):
|
if isinstance(i['content'],str):
|
||||||
if query in i:
|
if query in i['content']:
|
||||||
soup = BeautifulSoup(i, 'html.parser')
|
soup = BeautifulSoup(i['content'], 'html.parser')
|
||||||
readable_text = soup.get_text(separator=' ', strip=True)
|
readable_text = soup.get_text(separator=' ', strip=True)
|
||||||
|
print(i['url'])
|
||||||
|
print(i['created_at'])
|
||||||
print(readable_text)
|
print(readable_text)
|
||||||
print("----")
|
print("----")
|
||||||
#+end_src
|
#+end_src
|
||||||
|
|
Loading…
Reference in a new issue