#!/home/naiji/mastodon/vocaloiddb-bot/venv/bin/python

# Mastodon bot that posts a random Vocaloid album scraped from MikuDB,
# enriched with VocaDB links when available. Expects 'last_page.dat'
# (number of the last MikuDB search page) and 'token.dat' (Mastodon
# access token) in the working directory.

import sys
import random
import requests
import os.path as op

from bs4 import BeautifulSoup
from mastodon import Mastodon

MIKUDB_HEAD = 'http://mikudb.moe/page/'
MIKUDB_TAIL = '/?s'
VOCADB_HEAD = 'https://vocadb.net/'


def findRandomAlbumUrl(last_page_id) -> str:
    """Pick a random MikuDB search page and return the URL of a random album on it.

    Gives up and returns an empty string after five pages without album entries.
    """
    alarm_counter = 0
    while alarm_counter < 5:
        selection_page_id = random.randint(0, int(last_page_id))
        resp = requests.get(MIKUDB_HEAD + str(selection_page_id) + MIKUDB_TAIL)
        soup = BeautifulSoup(resp.text, 'lxml')
        album_entries = soup.findAll('div', {'class': 'searchres album-box grid_19'})
        if len(album_entries) != 0:
            album_entry = random.choice(album_entries)
            return str(album_entry.findAll('a', href=True)[0]["href"])
        else:
            alarm_counter += 1
    return ""


def findAlbumImageUrl(soup) -> str:
    """Return the cover image URL of an album page, or '' if none is found."""
    image_soup = soup.findAll('a', {'rel': 'lightbox'})
    if len(image_soup) == 0:
        return ""
    else:
        return str(image_soup[0]["href"])


def findAlbumTitle(soup) -> str:
    """Return the album title of an album page, with a fallback note on failure."""
    title_soup = soup.findAll('h1', {'class': 'album-title'}, text=True)
    if len(title_soup) == 0:
        return "UNKNOWN TITLE!! somehow the parser failed... idk, please ping @NaiJi on this post"
    else:
        return str(title_soup[0].get_text())


def main():
    with open('last_page.dat', 'r', encoding='utf-8') as file:
        last_page_id = file.readlines()[0]

    album_url = findRandomAlbumUrl(last_page_id)
    print(album_url)
    if album_url == "":
        return

    # PARSING ACTUAL ALBUM PAGE

    resp = requests.get(album_url)
    soup = BeautifulSoup(resp.text, 'lxml')

    image_url = findAlbumImageUrl(soup)
    album_title = findAlbumTitle(soup)

    # PARSING ALBUM INFO BOX

    info_raw = str(soup.find('div', {'class': 'album-box album-infopost panel panel-default'}))
    info_splits = info_raw.split('\n')

    # Defaults, so the summary and status text below still work when the info box is missing.
    alternative_name = ''
    type_names = []
    release_year = ''
    vocal_names = []
    producers_names = []
    genres_names = []
    links = []

    if len(info_splits) != 1:
        # Marker strings used to cut values out of the raw HTML of the info box.
        # NOTE: the exact literals are assumptions about MikuDB's markup; adjust
        # them if the theme changes.
        span_token = '</span>'
        li_token = '</li>'
        tag_token = 'rel="tag">'
        a_token = '</a>'
        href_token = 'href="'
        href_end_token = '"'

        # # # ALTERNATIVE NAME
        for split in info_splits:
            if ' names:' in split:
                begin = split.find(span_token, 0) + len(span_token)
                end = split.find(li_token, 0)
                alternative_name = split[begin : end]
                break

        # # # TYPE
        for split in info_splits:
            if 'Type:' in split:
                amount = split.count(tag_token)
                begin = 0
                end = 0
                for i in range(amount):
                    begin = split.find(tag_token, end) + len(tag_token)
                    end = split.find(a_token, begin)
                    type_names.append(split[begin : end])
                break

        # # # RELEASE YEAR
        for split in info_splits:
            if 'Release Date:' in split:
                begin = split.find(tag_token, 0) + len(tag_token)
                end = split.find(a_token, begin)
                release_year = split[begin : end]
                break

        # # # VOCALS
        for split in info_splits:
            if 'Vocals:' in split:
                amount = split.count(tag_token)
                begin = 0
                end = 0
                for i in range(amount):
                    begin = split.find(tag_token, end) + len(tag_token)
                    end = split.find(a_token, begin)
                    vocal_names.append(split[begin : end])
                break

        # # # PRODUCERS
        for split in info_splits:
            if 'Producer:' in split:
                amount = split.count(tag_token)
                begin = 0
                end = 0
                for i in range(amount):
                    begin = split.find(tag_token, end) + len(tag_token)
                    end = split.find(a_token, begin)
                    producers_names.append(split[begin : end])
                break

        # # # GENRES
        for split in info_splits:
            if 'Genre:' in split:
                amount = split.count(tag_token)
                begin = 0
                end = 0
                for i in range(amount):
                    begin = split.find(tag_token, end) + len(tag_token)
                    end = split.find(a_token, begin)
                    genres_names.append(split[begin : end])
                break

        # # # LINKS
        for split in info_splits:
            if 'Official site' in split:
                amount = split.count(href_token)
                begin = 0
                end = 0
                for i in range(amount):
                    begin = split.find(href_token, end) + len(href_token)
                    end = split.find(href_end_token, begin)
                    links.append(split[begin : end])
                break

    print(album_title)
    print('--------')
    print(alternative_name)
    print(type_names)
    print(vocal_names)
    print(producers_names)
    print(genres_names)
    print(release_year)
    print(links)
    print(image_url)

    # SEARCHING FOR YOUTUBE URL

    youtube_url = ''
    video_page_splits = str(soup).split('\n')
    for split in video_page_splits:
        if 'youtube' in split:
            begin = split.find('src="', 0) + len('src="')
            end = split.find('"', begin)
            youtube_url = split[begin : end]

    # SEARCHING FOR VOCADB URL

    vocadb_url = ""
    entry_content_soup = soup.findAll('div', {'class': 'entry-content'})
    entry_content_splits = str(entry_content_soup).split('\n')
    for split in entry_content_splits:
        if 'vocadb.net' in split:
            begin = split.find('a href="', 0) + len('a href="')
            end = split.find('">Vo', 0)
            vocadb_url = split[begin : end]

    # PARSING VOCADB PAGE

    external_links = []
    vocadb_url = vocadb_url.replace('amp;', '')
    if len(vocadb_url) > 0:
        resp = requests.get(vocadb_url)
        soup = BeautifulSoup(resp.text, 'lxml')
        if len(soup.findAll('img', {'class': 'coverPic'})) > 0:
            vocadb_splits = str(soup).split('\n')
            for split in vocadb_splits:
                if 'www.nicovideo.jp/watch' in split and len(youtube_url) == 0:
                    begin = split.find('href="', 0) + len('href="')
                    end = split.find('">', begin)
                    youtube_url = split[begin : end]
                if 'class="extLink"' in split and 'amazon' not in split:
                    begin = split.find('href="', 0) + len('href="')
                    end = split.find('" onclick', begin)
                    external_links.append(split[begin : end])

    print(external_links)
    print(youtube_url)

    # BUILDING THE STATUS TEXT

    text = "ALBUM:\n" + album_title

    if len(alternative_name) > 0:
        text += '\n\nALTERNATIVE TITLES:\n' + alternative_name

    if len(type_names) > 0:
        text += '\n\nTYPE:\n'
        for type_name in type_names:
            text += (type_name + '; ')

    if len(vocal_names) > 0:
        text += '\n\nVOCAL:\n'
        for vocal_name in vocal_names:
            text += (vocal_name + '; ')

    if len(producers_names) > 0:
        text += '\n\nPRODUCING:\n'
        for producer_name in producers_names:
            text += (producer_name + '; ')

    if len(genres_names) > 0:
        text += '\n\nGENRE:\n'
        for genre_name in genres_names:
            text += (genre_name + '; ')

    if len(release_year) > 0:
        text += '\n\nRELEASED:\n' + release_year

    if len(youtube_url) > 0:
        text += '\n\nVIDEO: \n' + youtube_url

    text += '\n\nMIKUDB: \n' + album_url

    # Fall back to the MikuDB "Official site" links if VocaDB gave us nothing.
    if len(external_links) == 0:
        external_links = links

    if len(external_links) > 0:
        text += '\n\nLINKS: \n'
        for external_link in external_links:
            text += (external_link + '\n\n')

    # POSTING TO MASTODON

    mastodon = Mastodon(
        access_token='token.dat',
        api_base_url='https://udongein.xyz/'
    )

    # Attach the cover art only when an image URL was actually found.
    media_ids = []
    if len(image_url) > 0:
        fformat = op.splitext(image_url)[1][1:]
        if fformat == 'jpg':
            fformat = 'jpeg'
        image_media = mastodon.media_post(requests.get(image_url).content, f'image/{fformat}')
        media_ids.append(image_media)

    mastodon.status_post(text, media_ids=media_ids, visibility='unlisted', sensitive=False)


if __name__ == '__main__':
    sys.exit(main())