diff --git a/archive.py b/archive.py index b7e1416..84799b6 100644 --- a/archive.py +++ b/archive.py @@ -1,4 +1,5 @@ import argparse +import asyncio import sys import dataset @@ -7,6 +8,8 @@ import toml import vk_api from mastodon import Mastodon +import utils + parser = argparse.ArgumentParser() parser.add_argument("-c", "--config", help="Config path") parser.add_argument("-g", "--group", help="VK group to archive") @@ -28,7 +31,6 @@ vk = vk_session.get_api() tools = vk_api.VkTools(vk_session) -print(f"Downloading list of posts in group {args.group}...") parsed_posts = [] db = dataset.connect('sqlite:///database.db') @@ -38,61 +40,36 @@ uploaded_posts = db['uploaded_posts'] group_last_post_count = last_post_count_table.find_one(group=args.group) posts_raw = {} -if group_last_post_count == None: +print(f"Downloading list of posts in group {args.group}...") +if group_last_post_count is None: # download full wall posts_raw = tools.get_all('wall.get', 100, {'domain': args.group}) else: - # download only neccessary posts from vk - posts_raw["items"] = [] + # download only necessary posts from vk last_post_count = group_last_post_count["count"] - p_tmp = vk.wall.get(domain=args.group, count=1) - has_pinned_post = p_tmp["items"][0].get("is_pinned") == 1 - current_count = p_tmp["count"] - if current_count == last_post_count: - print("Nothing to do, quitting...") - sys.exit(0) - posts_raw["count"] = current_count - download_count = current_count - last_post_count - download_offset = 0 - if has_pinned_post: - # skip pinned post, cuz it appears first in the list - download_offset += 1 - while download_count > 0: - to_download = 0 - if download_count - 100 < 0: - to_download = download_count - download_count = 0 - else: - to_download = 100 - download_count -= 100 - download_offset += 100 - posts_raw_tmp = vk.wall.get(domain=args.group, offset=download_offset, count=to_download) - posts_raw["items"].extend(posts_raw_tmp["items"]) - last_post_count += 
len(posts_raw_tmp["items"]) + posts_raw["items"] = asyncio.run(utils.download_posts_incrementally(vk, args.group, last_post_count)) posts = posts_raw["items"] for p in posts: - attachments = p.get("attachments") - if attachments == None: + if uploaded_posts.find_one(group=args.group, post_id=p["id"]) is not None: + print(f"Post {p['id']} already has been uploaded, skipping it...") continue - parsed_post = {} - parsed_post["id"] = p["id"] - parsed_post["text"] = p["text"] - parsed_post["date"] = p["date"] - parsed_post["pinned"] = p.get("is_pinned") == 1 - parsed_post["attachments"] = [] - for a in attachments: - if a["type"] == "photo": - # get the biggest resolution of the photo - a["photo"]["sizes"].sort(key=lambda e: e["height"], reverse=True) - parsed_post["attachments"].append(a["photo"]["sizes"][0]["url"]) + + attachments = p.get("attachments") + parsed_post = {"id": p["id"], "text": p["text"], "date": p["date"], "pinned": p.get("is_pinned") == 1, + "attachments": []} + if attachments is not None: + for a in attachments: + if a["type"] == "photo": + # get the biggest resolution of the photo + a["photo"]["sizes"].sort(key=lambda e: e["height"], reverse=True) + parsed_post["attachments"].append(a["photo"]["sizes"][0]["url"]) parsed_posts.append(parsed_post) parsed_posts.sort(key=lambda e: e["date"]) print("Uploading posts to the Fediverse...") -if group_last_post_count == None: - group_last_post_count = {} - group_last_post_count['count'] = 0 +if group_last_post_count is None: + group_last_post_count = {'count': 0, 'group': args.group} c = 0 for p in parsed_posts: uploaded_media = [] @@ -108,3 +85,5 @@ for p in parsed_posts: last_post_count_table.upsert(group_last_post_count, ['group']) c += 1 print(f"Progress: {c}/{len(parsed_posts)}") + +db.close() diff --git a/bot.py b/bot.py new file mode 100644 index 0000000..1acc6e7 --- /dev/null +++ b/bot.py @@ -0,0 +1,109 @@ +import argparse +import asyncio +import queue +import threading +import time + +import dataset 
+import requests +import toml +import vk_api +from mastodon import Mastodon + +import utils + +parser = argparse.ArgumentParser() +parser.add_argument("-c", "--config", help="Config path") +args = parser.parse_args() + +config = toml.load(args.config) + +vk_session = vk_api.VkApi(token=config["vk"]["access_token"]) +vk = vk_session.get_api() + +mastodon_clients = {} +bots_longpoll = {} +bot_threads = {} +q = queue.Queue() + +db = dataset.connect('sqlite:///database.db') +uploaded_posts = db['uploaded_posts'] + + +def bot_loop(): + while True: + # get new post from queue + post_chunk = q.get(block=True, timeout=None) + + m = mastodon_clients.get(post_chunk["group"]) + if m is None: + print(f"couldn't find corresponding mastodon client for group {post_chunk['group']}") + continue + + for post in post_chunk["items"]: + attachments = post.get("attachments") + + parsed_post = {"id": post["id"], "text": post["text"], "date": post["date"], + "pinned": post.get("is_pinned") == 1, + "attachments": []} + if attachments is not None: + for a in attachments: + if a["type"] == "photo": + # get the biggest resolution of the photo + a["photo"]["sizes"].sort(key=lambda e: e["height"], reverse=True) + parsed_post["attachments"].append(a["photo"]["sizes"][0]["url"]) + + uploaded_media = [] + for i in parsed_post["attachments"]: + resp = requests.get(i) + media = m.media_post(resp.content, mime_type='image/jpeg') + uploaded_media.append(media) + toot = m.status_post(parsed_post["text"], media_ids=uploaded_media, visibility='public') + if parsed_post['pinned']: + m.status_pin(toot['id']) + uploaded_posts.insert({'group': post_chunk["group"], 'post_id': post['id']}) + + group_last_post_count = db['last_post_count'].find_one(group=post_chunk["group"]) + if group_last_post_count is None: + group_last_post_count = {'count': 0, 'group': post_chunk["group"]} # FIXME this shouldn't happen + group_last_post_count['count'] += 1 + db['last_post_count'].upsert(group_last_post_count, ['group']) + 
print(f"Uploaded post {post['id']} for group {post_chunk['group']} successfully!") + + +async def listen_new_posts(): + tasks = [] + for group in config["mastodon"]: + group_last_post_count = db['last_post_count'].find_one(group=group) + if group_last_post_count is None: + group_last_post_count = {'count': 0, + 'group': group} # FIXME need to execute archive.py code for full downloading of wall on new group + # FIXME this shouldn't happen + tasks.append(utils.download_posts_incrementally(vk, group, group_last_post_count['count'])) + new_post_chunks = await asyncio.gather(*tasks) + for chunk in new_post_chunks: + if len(chunk["items"]) > 0: + q.put(chunk, block=True, timeout=None) + +for k in config["mastodon"]: + mastodon_clients[k] = Mastodon( + access_token=config["mastodon"][k]["access_token"], + api_base_url=config["mastodon"][k]["instance"] + ) + print(k) + + t = threading.Thread(target=bot_loop) + t.start() + bot_threads[k] = t + +print("Bot has been set up, listening events...") + + +while True: + try: + asyncio.run(listen_new_posts()) + time.sleep(5) + except KeyboardInterrupt: + db.close() + break + diff --git a/utils.py b/utils.py new file mode 100644 index 0000000..76dad8e --- /dev/null +++ b/utils.py @@ -0,0 +1,30 @@ +from vk_api import VkApi + + +async def download_posts_incrementally(vk: VkApi, group_domain: str, last_post_count: int): + posts_raw = {"items": [], "group": group_domain} + p_tmp = vk.wall.get(domain=group_domain, count=1) + has_pinned_post = False + if len(p_tmp["items"]) > 0: + has_pinned_post = p_tmp["items"][0].get("is_pinned") == 1 + current_count = p_tmp["count"] + if current_count == last_post_count: + return posts_raw + download_count = current_count - last_post_count + download_offset = 0 + if has_pinned_post: + # skip pinned post, cuz it appears first in the list + download_offset += 1 + while download_count > 0: + to_download = 0 + if download_count - 100 < 0: + to_download = download_count + download_count = 0 + else: + 
to_download = 100 + download_count -= 100 + download_offset += 100 + posts_raw_tmp = vk.wall.get(domain=group_domain, offset=download_offset, count=to_download) + posts_raw["items"].extend(posts_raw_tmp["items"]) + last_post_count += len(posts_raw_tmp["items"]) + return posts_raw