"""Scrape the comments of the top Instagram posts for one hashtag.

The script asks Instagram's web API for the "top" sections of ``tag_name``,
pages through the comments of every media item found there, and dumps all
collected comments into a single timestamped JSON file in the current
directory. All traffic is sent through a local SOCKS5 proxy.
"""
import json
import socket
import time
import urllib.parse
from datetime import datetime

import requests
import socks

# Route every socket opened by this process through the local SOCKS5 proxy.
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, '127.0.0.1', 1080)
socket.socket = socks.socksocket

tag_name = 'برنامه_نویسی'

# Seconds to wait for a single HTTP response before giving up.
REQUEST_TIMEOUT_SECONDS = 30
# Pause after every comments request.
PAGE_DELAY_SECONDS = 5
# Consecutive failed page requests tolerated before a post is abandoned;
# without a cap, a permanently failing page would be retried forever.
MAX_CONSECUTIVE_FAILURES = 5
# Cursor value that makes getComments() request the first page of a post.
_FIRST_PAGE_CURSOR = 'permalink_enabled=false'

# NOTE(review): the session cookie and CSRF token below are hard-coded
# credentials. They are kept verbatim so the script behaves as before, but
# they should be moved out of source control (environment/config) and rotated.
_COOKIE_BASE = (
    'ig_did=ACEC5413-54FC-4EFA-B0AF-C5F43D60BB8A; '
    'datr=usoZZDUz2YHk2UAtkauI2jWW; '
    'mid=ZBnKwwALAAEW_HOHO2zCuHUsHbG8; '
    'ig_nrcb=1; '
    'sessionid=58527153666%3AszpE7BNZUXL1Z9%3A7%3AAYelK_Wp6wCVw9U89yijo3VXXbDpn4zdD9Q-rOXy-Q; '
    'ds_user_id=58527153666; '
    'csrftoken=3rSOOhL1DSUQT2czImiCbMevgLgGx6kZ; '
    'dpr=1.25; '
    'shbid="16293\\05458527153666\\0541712933531:01f7f3123fe1f37549d037b1a1411e119e8d36295768b16f0d4189989400a3f6c4719399"; '
    'shbts="1681397531\\05458527153666\\0541712933531:01f78bd56e3232a7c8e15ca38b2d311e7d086901d832c9e272814290edb76152436a31dd"; '
)
# The two endpoints were captured with different ``rur`` cookie values.
_TOP_POSTS_COOKIE = _COOKIE_BASE + (
    'rur="LDC\\05458527153666\\0541712933636:01f75496bea7dd870ec16fa6f07ff96b9d518c7d09d2d1b3c6b84cdcbd06689e2ae5bc1e"'
)
_COMMENTS_COOKIE = _COOKIE_BASE + (
    'rur="LDC\\05458527153666\\0541712933653:01f7eaf265c5347c6a98150aed096c93c7f28416075157f5491744f72be82f92a96b225b"'
)


def _build_headers(referer, cookie):
    """Return the request headers shared by both endpoints."""
    return {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0',
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'X-CSRFToken': '3rSOOhL1DSUQT2czImiCbMevgLgGx6kZ',
        'X-IG-App-ID': '936619743392459',
        'X-ASBD-ID': '198387',
        'X-IG-WWW-Claim': 'hmac.AR0OphPNsUYX-i9oXNzfh6JF3hDx_3eDAjxFFjWI0DYyKsO6',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive',
        'Referer': referer,
        'Cookie': cookie,
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
    }


def getTopPosts():
    """Fetch the ``tags/web_info`` payload for ``tag_name``.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: if the request fails or times out.
        ValueError: if the response body is not valid JSON.
    """
    # Derived from tag_name so the Referer always matches the requested tag;
    # for the current tag this yields the same percent-encoded URL as before.
    referer = f"https://www.instagram.com/explore/tags/{urllib.parse.quote(tag_name)}/"
    response = requests.get(
        'https://www.instagram.com/api/v1/tags/web_info/',
        params={'tag_name': tag_name},
        headers=_build_headers(referer, _TOP_POSTS_COOKIE),
        timeout=REQUEST_TIMEOUT_SECONDS,
    )
    return response.json()


def getComments(min_id: str, post_pk):
    """Fetch one page of comments for the post identified by ``post_pk``.

    Args:
        min_id: Pagination cursor. A value starting with ``"permalink"``
            requests the first page; any other value is sent URL-quoted as
            the ``min_id`` query parameter.
        post_pk: Media primary key interpolated into the endpoint URL.

    Returns:
        The decoded JSON body on success, or ``False`` when the request
        fails, times out, or the body is not valid JSON.
    """
    if not isinstance(min_id, str):
        # The original relied on a bare ``except`` to turn this into False.
        return False

    base_url = (
        f"https://www.instagram.com/api/v1/media/{post_pk}/comments/"
        "?can_support_threading=true"
    )
    if min_id.startswith("permalink"):
        url = f"{base_url}&permalink_enabled=false"
    else:
        url = f"{base_url}&min_id={urllib.parse.quote(min_id.encode())}"

    headers = _build_headers(
        'https://www.instagram.com/p/CqkUyB6ICbR/', _COMMENTS_COOKIE)
    try:
        response = requests.get(
            url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
        return response.json()
    except (requests.RequestException, OSError, ValueError):
        # Network/proxy failures and undecodable bodies are reported as
        # False; KeyboardInterrupt and programming errors now propagate.
        return False


def _collect_post_comments(post_pk):
    """Page through every comment of one post and return them as a list."""
    collected = []
    next_cursor = _FIRST_PAGE_CURSOR
    consecutive_failures = 0
    while True:
        page = getComments(min_id=next_cursor, post_pk=post_pk)
        if isinstance(page, dict):
            consecutive_failures = 0
            next_cursor = page.get('next_min_id')
            # Error payloads may lack 'comments'; treat that as an empty page
            # instead of aborting the whole run with a KeyError.
            collected += page.get('comments', [])
        else:
            consecutive_failures += 1
        time.sleep(PAGE_DELAY_SECONDS)

        if next_cursor is None or 'cached_comments_cursor' not in next_cursor:
            # No usable continuation cursor: this post is finished.
            break
        if consecutive_failures >= MAX_CONSECUTIVE_FAILURES:
            # Give up on this post rather than retrying the same page forever.
            break
    return collected


data = getTopPosts()
grids = data['data']['top']['sections']
comments = []
mediacounter = 0
last_post_pk = None  # pk of the most recently processed media, if any
for grid in grids:
    medias = grid['layout_content']['medias']
    for media in medias:
        last_post_pk = media['media']['pk']
        comments += _collect_post_comments(last_post_pk)
        mediacounter += 1
        print(mediacounter)

# NOTE(review): the original file's indentation was lost; this assumes the
# summary and the dump ran once after all posts were processed - confirm.
print(f"comments counts: {len(comments)}")
current = datetime.now()
# The file name carries the pk of the last processed post (None when the tag
# returned no media, where the original would have raised NameError).
output_name = (
    f"{tag_name}-{current.strftime('%Y-%m-%d-T-%H-%M-%S')}"
    f"-{last_post_pk}-scrapped.json"
)
with open(output_name, "w", encoding="utf-8") as f:
    f.write(json.dumps(comments))