"""Download the comments of every post listed in ``<account>.json``.

The script reads ``{target_account_name}.json``, iterates over its ``items``
list, pages through the Instagram comments endpoint for each post and writes
the collected comments of each post to its own JSON file.

Importing this module has side effects: it installs a SIGINT handler and
replaces ``socket.socket`` so that sockets go through a local SOCKS5 proxy.
"""
import json
import signal
import socket
import sys
import time
import urllib.parse
from datetime import datetime

import pymongo
import requests
import socks


def signal_handling(signum, frame):
    """Handle SIGINT by printing a short notice and exiting the process."""
    print("you chose to end the program")
    sys.exit()


signal.signal(signal.SIGINT, signal_handling)

# Send every socket opened by this process through the SOCKS5 proxy listening
# on 127.0.0.1:1080 (this is what makes `requests` use the proxy).
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, '127.0.0.1', 1080)
socket.socket = socks.socksocket

# Upper bound, in seconds, for a single HTTP request. Without a timeout a
# stalled proxy or connection would block the scraper forever.
REQUEST_TIMEOUT_SECONDS = 30

# NOTE(review): the CSRF token, session cookie and the MongoDB password below
# are credentials committed to source control. Consider loading them from the
# environment or a secrets store instead.
_REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0',
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'X-CSRFToken': 'SyRjkhE11IZJJzEjFYNFOCAkbHObhpp6',
    'X-IG-App-ID': '936619743392459',
    'X-ASBD-ID': '198387',
    'X-IG-WWW-Claim': 'hmac.AR3dSNyXs6hL8Kd5CQUR--JX03KHuY0enjoxwEaUbJ60jFfO',
    'X-Requested-With': 'XMLHttpRequest',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Referer': 'https://www.instagram.com/p/Cp7R84moFix/',
    # Adjacent literals are concatenated into one cookie header value.
    'Cookie': (
        'dpr=1.3636363636363635; '
        'ig_did=C7B7A9D0-EAFA-4B62-8AFF-8217A91EA3B8; '
        'datr=xKsVZPIxlYi2YshNs71hgDiE; '
        'csrftoken=SyRjkhE11IZJJzEjFYNFOCAkbHObhpp6; '
        'mid=ZBWrxgALAAEC4bnKm9IP1TZAOBVU; '
        'ig_nrcb=1; '
        'rur="ASH\\05458490635989\\0541710690982:01f7d99273b41cbcfaffa5a5edd22f59b954c960e0e8a5576dd95f63fe741570bc97a123"; '
        'ds_user_id=58490635989; '
        'sessionid=58490635989%3Asm9z3FQSns1adH%3A5%3AAYcZBsVcUAY3bASwyC9goX-5mMD89Zxnpbjt29E0ug'
    ),
    'Sec-Fetch-Dest': 'empty',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Site': 'same-origin',
    'TE': 'trailers',
}


def get_database():
    """Return a ``pymongo.MongoClient`` for the local ``insta`` database URI.

    The client itself is returned, not a database handle; callers pick the
    database from it. Nothing in this file currently calls this function.
    """
    CONNECTION_STRING = "mongodb://amir:1234@localhost:27017/insta"
    client = pymongo.MongoClient(CONNECTION_STRING)
    return client


def get_comments(min_id: str, post_pk):
    """Fetch one page of comments for a post and return the decoded JSON.

    Args:
        min_id: Paging cursor. A value starting with ``"permalink"`` requests
            the first page; any other value is URL-quoted and sent as the
            ``min_id`` query parameter.
        post_pk: Identifier of the post, interpolated into the request URL.

    Returns:
        The response body parsed as JSON.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status (via ``raise_for_status``).
        ValueError: If the response body is not valid JSON.
    """
    base_url = f"https://www.instagram.com/api/v1/media/{post_pk}/comments/?can_support_threading=true"
    if min_id.startswith("permalink"):
        url = f"{base_url}&permalink_enabled=false"
    else:
        url = f"{base_url}&min_id={urllib.parse.quote(str(min_id).encode())}"

    response = requests.get(
        url,
        headers=_REQUEST_HEADERS,
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    # Fail here with a clear HTTP error rather than later with an opaque
    # KeyError/JSON error when the server rejects the request.
    response.raise_for_status()
    return response.json()


target_account_name = "bartar.academy"
target_classification = "seo-web"


def _fetch_all_comments(post_pk):
    """Page through the comments of one post and return them as a list."""
    cursor = 'permalink_enabled=false'
    comments = []
    while True:
        page = get_comments(cursor, post_pk)
        comments += page['comments']
        cursor = page.get('next_min_id')
        # Stop when no cursor is returned or it lacks the
        # 'cached_comments_cursor' marker (presumably meaning there are no
        # further pages - confirm against the API).
        if cursor is None or 'cached_comments_cursor' not in cursor:
            return comments
        # Pause between page requests of the same post.
        time.sleep(3)


def _save_comments(index, post_pk, comments):
    """Write the comments of one post to a timestamped JSON file."""
    stamp = datetime.now().strftime('%Y-%m-%d-T-%H-%M-%S')
    out_name = f"{index}-{target_classification}-{target_account_name}-{stamp}-{post_pk}-scrapped.json"
    with open(out_name, "w", encoding="utf8") as out_file:
        out_file.write(json.dumps(comments))


def main():
    """Scrape and save the comments of every post in the account's JSON file."""
    with open(f"{target_account_name}.json", 'r', encoding="utf8") as posts_file:
        posts = json.load(posts_file)

    # Output files are numbered from 1 in the order posts appear in the file.
    for index, post in enumerate(posts['items'], start=1):
        comments = _fetch_all_comments(post['pk'])
        _save_comments(index, post['pk'], comments)
        print(len(comments))


if __name__ == "__main__":
    try:
        main()
    except Exception as exc:
        # Top-level boundary: report the failure instead of silently
        # swallowing it. SystemExit raised by the SIGINT handler is not an
        # Exception subclass, so Ctrl+C still exits cleanly.
        print(f"scrape aborted: {type(exc).__name__}: {exc}", file=sys.stderr)
        sys.exit(1)