commit 6aa29fd017
Author: NaiJi ✨
Date: 2021-08-18 18:06:15 +02:00

3 changed files with 282 additions and 0 deletions

.gitignore vendored Normal file

@@ -0,0 +1,2 @@
token.dat
venv

last_page.dat Normal file

@@ -0,0 +1 @@
392

post.py Executable file

@@ -0,0 +1,279 @@
#!/home/naiji/mastodon/vocaloiddb-bot/venv/bin/python
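# Bot script: picks a random Vocaloid album on mikudb.moe, scrapes its
# details (and related VocaDB links), and posts a summary to Mastodon.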
import sys
import random
import requests
import os.path as op
from bs4 import BeautifulSoup
from mastodon import Mastodon
MIKUDB_HEAD = 'http://mikudb.moe/page/'
MIKUDB_TAIL = '/?s'
VOCADB_HEAD = 'https://vocadb.net/'
def findRandomAlbumUrl(last_page_id) -> str:
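    """Pick a random mikudb.moe search page and return the URL of a random
    album entry on it; give up and return '' after 5 pages with no results."""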
alarm_counter = 0
    while alarm_counter < 5:
selection_page_id = random.randint(0, int(last_page_id))
resp = requests.get(MIKUDB_HEAD + str(selection_page_id) + MIKUDB_TAIL)
soup = BeautifulSoup(resp.text, 'lxml')
        album_entries = soup.findAll('div', {'class': 'searchres album-box grid_19'})
        if len(album_entries) != 0:
            album_entry = random.choice(album_entries)
            return str(album_entry.findAll('a', href=True)[0]["href"])
else:
alarm_counter += 1
return ""
def findAlbumImageUrl(soup) -> str:
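    """Return the URL of the album cover image, or '' if the page has no lightbox link."""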
image_soup = soup.findAll('a', {'rel': 'lightbox'})
if len(image_soup) == 0:
return ""
else:
return str(image_soup[0]["href"])
def findAlbumTitle(soup) -> str:
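    """Return the album title text from the page header, or a fallback message."""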
title_soup = soup.findAll('h1', {'class': 'album-title'}, text=True)
if len(title_soup) == 0:
        return "UNKNOWN TITLE!! somehow the parser failed... idk, please ping @NaiJi on this post"
else:
return str(title_soup[0].get_text())
def main():
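    """Scrape a random album from mikudb.moe (plus VocaDB details) and post it."""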
with open('last_page.dat', 'r', encoding='utf-8') as file:
last_page_id = file.readlines()[0]
album_url = findRandomAlbumUrl(last_page_id)
print(album_url)
if album_url == "":
return
# PARSING ACTUAL ALBUM PAGE
resp = requests.get(album_url)
soup = BeautifulSoup(resp.text, 'lxml')
image_url = findAlbumImageUrl(soup)
album_title = findAlbumTitle(soup)
# PARSING ALBUM INFO BOX
info_raw = str(soup.find('div', {'class': 'album-box album-infopost panel panel-default'}))
info_splits = info_raw.split('\n')
    # Defaults in case the album info box is missing from the page.
    alternative_name = ''
    type_names = []
    release_year = ''
    vocal_names = []
    producers_names = []
    genres_names = []
    links = []
    if len(info_splits) != 1:
span_token = '</span>'
li_token = '</li>'
tag_token = 'rel="tag">'
a_token = '</a>'
href_token = '<a href="'
href_end_token = '">'
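        # Each block below looks for a labelled line in the info box and
        # slices the value(s) out of the raw HTML using the tokens above.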
# # # ALTERNATIVE NAME
alternative_name = ''
for split in info_splits:
if ' names:' in split:
begin = split.find(span_token, 0) + len(span_token)
end = split.find(li_token, 0)
alternative_name = split[begin : end]
break
# # # TYPE
type_names = []
for split in info_splits:
if 'Type:' in split:
amount = split.count(tag_token)
begin = 0
end = 0
for i in range(amount):
begin = split.find(tag_token, end) + len(tag_token)
end = split.find(a_token, begin)
type_names.append(split[begin : end])
break
# # # RELEASE YEAR
release_year = ''
for split in info_splits:
if 'Release Date:' in split:
                begin = split.find(tag_token, 0) + len(tag_token)
                end = split.find(a_token, begin)
                release_year = split[begin : end]
break
# # # VOCALS
vocal_names = []
for split in info_splits:
if 'Vocals:' in split:
amount = split.count(tag_token)
begin = 0
end = 0
for i in range(amount):
begin = split.find(tag_token, end) + len(tag_token)
end = split.find(a_token, begin)
vocal_names.append(split[begin : end])
break
# # # PRODUCERS
producers_names = []
for split in info_splits:
if 'Producer:' in split:
amount = split.count(tag_token)
begin = 0
end = 0
for i in range(amount):
begin = split.find(tag_token, end) + len(tag_token)
end = split.find(a_token, begin)
producers_names.append(split[begin : end])
break
# # # GENRES
genres_names = []
for split in info_splits:
if 'Genre:' in split:
amount = split.count(tag_token)
begin = 0
end = 0
for i in range(amount):
begin = split.find(tag_token, end) + len(tag_token)
end = split.find(a_token, begin)
genres_names.append(split[begin : end])
break
# # # LINKS
links = []
for split in info_splits:
if 'Official site' in split:
amount = split.count(href_token)
begin = 0
end = 0
for i in range(amount):
begin = split.find(href_token, end) + len(href_token)
end = split.find(href_end_token, begin)
links.append(split[begin : end])
break
print(album_title)
print('--------')
print(alternative_name)
print(type_names)
print(vocal_names)
print(producers_names)
print(genres_names)
print(release_year)
print(links)
print(image_url)
# SEARCHING FOR YOUTUBE URL
youtube_url = ''
video_page_splits = str(soup).split('\n')
for split in video_page_splits:
if 'youtube' in split:
begin = split.find('src="', 0) + len('src="')
end = split.find('"', begin)
youtube_url = split[begin : end]
# SEARCHING FOR VOCADB URL
vocadb_url = ""
entry_content_soup = soup.findAll('div', {'class': 'entry-content'})
entry_content_splits = str(entry_content_soup).split('\n')
for split in entry_content_splits:
if 'vocadb.net' in split:
            begin = split.find('a href="', 0) + len('a href="')
            end = split.find('">Vo', begin)
            vocadb_url = split[begin : end]
# PARSING VOCADB PAGE
external_links = []
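    # Scraped hrefs may contain HTML-escaped ampersands ('&amp;'); undo the escaping.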
vocadb_url = vocadb_url.replace('amp;', '')
if len(vocadb_url) > 0:
resp = requests.get(vocadb_url)
soup = BeautifulSoup(resp.text, 'lxml')
if len(soup.findAll('img', {'class': 'coverPic'})) > 0:
vocadb_splits = str(soup).split('\n')
for split in vocadb_splits:
if 'www.nicovideo.jp/watch' in split and len(youtube_url) == 0:
begin = split.find('href="', 0) + len('href="')
end = split.find('">', begin)
youtube_url = split[begin : end]
if 'class="extLink"' in split and 'amazon' not in split:
begin = split.find('href="', 0) + len('href="')
end = split.find('" onclick', begin)
external_links.append(split[begin : end])
print(external_links)
print(youtube_url)
text = "ALBUM:\n" + album_title
if len(alternative_name) > 0:
text += str('\n\nALTERNATIVE TITLES:\n' + alternative_name)
if len(type_names) > 0:
text += '\n\nTYPE:\n'
for type_name in type_names:
text += (type_name + ' ')
if len(vocal_names) > 0:
text += '\n\nVOCAL:\n'
for vocal_name in vocal_names:
text += (vocal_name + ' ')
if len(producers_names) > 0:
text += '\n\nPRODUCING:\n'
for producer_name in producers_names:
text += (producer_name + ' ')
if len(genres_names) > 0:
text += '\n\nGENRE:\n'
for genre_name in genres_names:
text += (genre_name + ' ')
if len(release_year) > 0:
text += str('\n\nRELEASED:\n' + release_year)
if len(youtube_url) > 0:
text += str('\n\nVIDEO: \n' + youtube_url)
if len(external_links) == 0:
external_links = links
if len(external_links) > 0:
text += '\n\nLINKS: \n'
for external_link in external_links:
text += (external_link + '\n\n')
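    # Mastodon.py accepts a file path for access_token and reads the token from it.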
mastodon = Mastodon(
access_token = 'token.dat',
api_base_url = 'https://udongein.xyz/'
)
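    # Build the MIME type from the cover's file extension; 'jpg' must be
    # sent to the API as 'image/jpeg'.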
fformat = op.splitext(image_url)[1][1:]
    if fformat == 'jpg':
        fformat = 'jpeg'
image_media = mastodon.media_post(requests.get(image_url).content, f'image/{fformat}')
mastodon.status_post(text, media_ids=[image_media], visibility='unlisted', sensitive=False)
if __name__ == '__main__':
sys.exit(main())