From 6aa29fd0174aa31d529b21cc2e89bfbdd4599e89 Mon Sep 17 00:00:00 2001
From: NaiJi
Date: Wed, 18 Aug 2021 18:06:15 +0200
Subject: [PATCH] Init

---
 .gitignore    |   2 +
 last_page.dat |   1 +
 post.py       | 279 ++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 282 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 last_page.dat
 create mode 100755 post.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..76fd499
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+token.dat
+venv
diff --git a/last_page.dat b/last_page.dat
new file mode 100644
index 0000000..bd03e26
--- /dev/null
+++ b/last_page.dat
@@ -0,0 +1 @@
+392
diff --git a/post.py b/post.py
new file mode 100755
index 0000000..59fd95b
--- /dev/null
+++ b/post.py
@@ -0,0 +1,279 @@
+#!/home/naiji/mastodon/vocaloiddb-bot/venv/bin/python
+
+import re
+import sys
+import random
+import requests
+
+import os.path as op
+
+from bs4 import BeautifulSoup
+from mastodon import Mastodon
+
+MIKUDB_HEAD = 'http://mikudb.moe/page/'
+MIKUDB_TAIL = '/?s'
+
+VOCADB_HEAD = 'https://vocadb.net/'
+
+def findRandomAlbumUrl(last_page_id) -> str:
+    # Pick random search result pages until one contains album entries,
+    # giving up after five failed attempts.
+    alarm_counter = 0
+    while alarm_counter < 5:
+        selection_page_id = random.randint(0, int(last_page_id))
+        resp = requests.get(MIKUDB_HEAD + str(selection_page_id) + MIKUDB_TAIL)
+        soup = BeautifulSoup(resp.text, 'lxml')
+        found_album = True
+        album_entries = soup.findAll('div', {'class': 'searchres album-box grid_19'})
+        if len(album_entries) != 0:
+            found_album = True
+            album_entry = random.choice(album_entries)
+            return str(album_entry.findAll('a', href=True)[0]["href"])
+        else:
+            alarm_counter += 1
+
+    return ""
+
+def findAlbumImageUrl(soup) -> str:
+    image_soup = soup.findAll('a', {'rel': 'lightbox'})
+    if len(image_soup) == 0:
+        return ""
+    else:
+        return str(image_soup[0]["href"])
+
+def findAlbumTitle(soup) -> str:
+    title_soup = soup.findAll('h1', {'class': 'album-title'}, text=True)
+    if len(title_soup) == 0:
+        return "UNKNOWN TITLE!! somehow the parser failed... idk, please ping @NaiJi on this post"
+    else:
+        return str(title_soup[0].get_text())
+
+def main():
+
+    with open('last_page.dat', 'r', encoding='utf-8') as file:
+        last_page_id = file.readlines()[0]
+
+    album_url = findRandomAlbumUrl(last_page_id)
+    print(album_url)
+    if album_url == "":
+        return
+
+    # PARSING ACTUAL ALBUM PAGE
+
+    resp = requests.get(album_url)
+    soup = BeautifulSoup(resp.text, 'lxml')
+
+    image_url = findAlbumImageUrl(soup)
+    album_title = findAlbumTitle(soup)
+
+    # PARSING ALBUM INFO BOX
+
+    info_raw = str(soup.find('div', {'class': 'album-box album-infopost panel panel-default'}))
+    info_splits = info_raw.split('\n')
+
+    # Defaults so the post can still be built if the info box is missing
+    alternative_name = ''
+    type_names = []
+    release_year = ''
+    vocal_names = []
+    producers_names = []
+    genres_names = []
+    links = []
+
+    if len(info_splits) != 1:
+
+        # HTML delimiters for slicing field values out of the info box markup
+        span_token = '</span>'
+        li_token = '</li>'
+        tag_token = 'rel="tag">'
+        a_token = '</a>'
+        href_token = 'href="'
+        href_end_token = '"'
+
+        # # # ALTERNATIVE NAME
+
+        for split in info_splits:
+            if ' names:' in split:
+                begin = split.find(span_token, 0) + len(span_token)
+                end = split.find(li_token, 0)
+                alternative_name = split[begin : end]
+                break
+
+        # # # TYPE
+
+        for split in info_splits:
+            if 'Type:' in split:
+                amount = split.count(tag_token)
+                begin = 0
+                end = 0
+                for i in range(amount):
+                    begin = split.find(tag_token, end) + len(tag_token)
+                    end = split.find(a_token, begin)
+                    type_names.append(split[begin : end])
+                break
+
+        # # # RELEASE YEAR
+
+        for split in info_splits:
+            if 'Release Date:' in split:
+                begin = split.find(tag_token, 0) + len(tag_token)
+                end = split.find(a_token, 0)
+                release_year = split[begin : end]
+                break
+
+        # # # VOCALS
+
+        for split in info_splits:
+            if 'Vocals:' in split:
+                amount = split.count(tag_token)
+                begin = 0
+                end = 0
+                for i in range(amount):
+                    begin = split.find(tag_token, end) + len(tag_token)
+                    end = split.find(a_token, begin)
+                    vocal_names.append(split[begin : end])
+                break
+
+        # # # PRODUCERS
+
+        for split in info_splits:
+            if 'Producer:' in split:
+                amount = split.count(tag_token)
+                begin = 0
+                end = 0
+                for i in range(amount):
+                    begin = split.find(tag_token, end) + len(tag_token)
+                    end = split.find(a_token, begin)
+                    producers_names.append(split[begin : end])
+                break
+
+        # # # GENRES
+
+        for split in info_splits:
+            if 'Genre:' in split:
+                amount = split.count(tag_token)
+                begin = 0
+                end = 0
+                for i in range(amount):
+                    begin = split.find(tag_token, end) + len(tag_token)
+                    end = split.find(a_token, begin)
+                    genres_names.append(split[begin : end])
+                break
+
+        # # # LINKS
+
+        for split in info_splits:
+            if 'Official site' in split:
+                amount = split.count(href_token)
+                begin = 0
+                end = 0
+                for i in range(amount):
+                    begin = split.find(href_token, end) + len(href_token)
+                    end = split.find(href_end_token, begin)
+                    links.append(split[begin : end])
+                break
+
+    print(album_title)
+    print('--------')
+    print(alternative_name)
+    print(type_names)
+    print(vocal_names)
+    print(producers_names)
+    print(genres_names)
+    print(release_year)
+    print(links)
+    print(image_url)
+
+    # SEARCHING FOR YOUTUBE URL
+
+    youtube_url = ''
+
+    video_page_splits = str(soup).split('\n')
+    for split in video_page_splits:
+        if 'youtube' in split:
+            begin = split.find('src="', 0) + len('src="')
+            end = split.find('"', begin)
+            youtube_url = split[begin : end]
+
+    # SEARCHING FOR VOCADB URL
+
+    vocadb_url = ""
+    entry_content_soup = soup.findAll('div', {'class': 'entry-content'})
+    entry_content_splits = str(entry_content_soup).split('\n')
+    for split in entry_content_splits:
+        if 'vocadb.net' in split:
+            begin = split.find('a href="', 0) + len('a href="')
+            end = split.find('">Vo', 0)
+            vocadb_url = split[begin : end]
+
+    # PARSING VOCADB PAGE
+
+    external_links = []
+    vocadb_url = vocadb_url.replace('amp;', '')
+    if len(vocadb_url) > 0:
+        resp = requests.get(vocadb_url)
+        soup = BeautifulSoup(resp.text, 'lxml')
+        if len(soup.findAll('img', {'class': 'coverPic'})) > 0:
+            vocadb_splits = str(soup).split('\n')
+            for split in vocadb_splits:
+                if 'www.nicovideo.jp/watch' in split and len(youtube_url) == 0:
+                    begin = split.find('href="', 0) + len('href="')
+                    end = split.find('">', begin)
+                    youtube_url = split[begin : end]
+                if 'class="extLink"' in split and 'amazon' not in split:
+                    begin = split.find('href="', 0) + len('href="')
+                    end = split.find('" onclick', begin)
+                    external_links.append(split[begin : end])
+
+    print(external_links)
+    print(youtube_url)
+
+    text = "ALBUM:\n" + album_title
+
+    if len(alternative_name) > 0:
+        text += str('\n\nALTERNATIVE TITLES:\n' + alternative_name)
+
+    if len(type_names) > 0:
+        text += '\n\nTYPE:\n'
+        for type_name in type_names:
+            text += (type_name + ' ')
+
+    if len(vocal_names) > 0:
+        text += '\n\nVOCAL:\n'
+        for vocal_name in vocal_names:
+            text += (vocal_name + ' ')
+
+    if len(producers_names) > 0:
+        text += '\n\nPRODUCING:\n'
+        for producer_name in producers_names:
+            text += (producer_name + ' ')
+
+    if len(genres_names) > 0:
+        text += '\n\nGENRE:\n'
+        for genre_name in genres_names:
+            text += (genre_name + ' ')
+
+    if len(release_year) > 0:
+        text += str('\n\nRELEASED:\n' + release_year)
+
+    if len(youtube_url) > 0:
+        text += str('\n\nVIDEO: \n' + youtube_url)
+
+    if len(external_links) == 0:
+        external_links = links
+
+    if len(external_links) > 0:
+        text += '\n\nLINKS: \n'
+        for external_link in external_links:
+            text += (external_link + '\n\n')
+
+    mastodon = Mastodon(
+        access_token = 'token.dat',
+        api_base_url = 'https://udongein.xyz/'
+    )
+
+    fformat = op.splitext(image_url)[1][1:]
+    if (fformat == 'jpg'):
+        fformat = 'jpeg'
+
+    image_media = mastodon.media_post(requests.get(image_url).content, f'image/{fformat}')
+
+    mastodon.status_post(text, media_ids=[image_media], visibility='unlisted', sensitive=False)
+
+if __name__ == '__main__':
+    sys.exit(main())
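
A note on the info-box parsing in post.py above: the script slices field values out of the raw HTML with string delimiters (span_token, li_token, tag_token, href_token). Below is a minimal sketch of the same extraction done through BeautifulSoup selectors instead. It assumes the mikudb.moe info box keeps each field in an <li> whose label sits in a <span> and whose values are <a rel="tag"> links, which is what the delimiters in post.py imply; the helper name parse_info_box is hypothetical and not part of the patch.

from bs4 import BeautifulSoup

def parse_info_box(album_html) -> dict:
    # Locate the same info box div that post.py stringifies and splits by line.
    soup = BeautifulSoup(album_html, 'lxml')
    box = soup.find('div', {'class': 'album-box album-infopost panel panel-default'})
    info = {}
    if box is None:
        return info
    # Assumed layout: one <li> per field, a <span> label, values as rel="tag" links.
    for li in box.findAll('li'):
        label = li.find('span')
        if label is None:
            continue
        key = label.get_text(strip=True).rstrip(':')
        tags = [a.get_text(strip=True) for a in li.findAll('a', {'rel': 'tag'})]
        if tags:
            info[key] = tags
        else:
            # Plain-text fields (e.g. alternative names): keep what follows the label.
            info[key] = li.get_text(strip=True).replace(label.get_text(strip=True), '', 1).strip()
    return info

Fed resp.text from the album page, this would return something like {'Type': [...], 'Vocals': [...], 'Genre': [...]}, which the text-building block in main() could consume in place of the split()/find() loops; whether it covers every album page layout on mikudb.moe is untested here.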