103 lines
4.8 KiB
Python
103 lines
4.8 KiB
Python
import json
|
||
import time
|
||
import requests
|
||
import socket
|
||
import socks
|
||
import urllib.parse
|
||
from datetime import datetime
|
||
|
||
# Configure the default proxy used by `socks.socksocket`: a SOCKS5 proxy
# listening on 127.0.0.1:1080.
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, '127.0.0.1', 1080)
|
||
# Monkey-patch the standard socket class so that code creating sockets via
# `socket.socket` from here on gets a `socks.socksocket` instead. This must
# run before any network request is made.
socket.socket = socks.socksocket
|
||
# Hashtag (without the leading '#') to scrape; sent as the `tag_name` query
# parameter and used as the prefix of the output file names.
tag_name = 'برنامه_نویسی'
|
||
|
||
|
||
def getTopPosts():
    """Fetch the hashtag ``web_info`` payload for the module-level ``tag_name``.

    Sends a single GET request to Instagram's ``/api/v1/tags/web_info/``
    endpoint with a fixed set of browser-like headers and returns the decoded
    JSON body.

    Returns:
        The parsed JSON response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    # NOTE(review): the CSRF token and session cookie below are hard-coded
    # credentials. They should be loaded from the environment or a config
    # file that is kept out of version control.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0',
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'X-CSRFToken': '3rSOOhL1DSUQT2czImiCbMevgLgGx6kZ',
        'X-IG-App-ID': '936619743392459',
        'X-ASBD-ID': '198387',
        'X-IG-WWW-Claim': 'hmac.AR0OphPNsUYX-i9oXNzfh6JF3hDx_3eDAjxFFjWI0DYyKsO6',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive',
        'Referer': 'https://www.instagram.com/explore/tags/%D8%A8%D8%B1%D9%86%D8%A7%D9%85%D9%87_%D9%86%D9%88%DB%8C%D8%B3%DB%8C/',
        'Cookie': 'ig_did=ACEC5413-54FC-4EFA-B0AF-C5F43D60BB8A; datr=usoZZDUz2YHk2UAtkauI2jWW; mid=ZBnKwwALAAEW_HOHO2zCuHUsHbG8; ig_nrcb=1; sessionid=58527153666%3AszpE7BNZUXL1Z9%3A7%3AAYelK_Wp6wCVw9U89yijo3VXXbDpn4zdD9Q-rOXy-Q; ds_user_id=58527153666; csrftoken=3rSOOhL1DSUQT2czImiCbMevgLgGx6kZ; dpr=1.25; shbid="16293\\05458527153666\\0541712933531:01f7f3123fe1f37549d037b1a1411e119e8d36295768b16f0d4189989400a3f6c4719399"; shbts="1681397531\\05458527153666\\0541712933531:01f78bd56e3232a7c8e15ca38b2d311e7d086901d832c9e272814290edb76152436a31dd"; rur="LDC\\05458527153666\\0541712933636:01f75496bea7dd870ec16fa6f07ff96b9d518c7d09d2d1b3c6b84cdcbd06689e2ae5bc1e"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
    }

    params = {
        'tag_name': tag_name,
    }

    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(
        'https://www.instagram.com/api/v1/tags/web_info/',
        params=params,
        headers=headers,
        timeout=30,
    )
    # Fail here with a clear HTTP error instead of letting an error payload
    # surface later as a KeyError in the caller.
    response.raise_for_status()
    return response.json()
|
||
|
||
|
||
def getComments(min_id: str, post_pk):
    """Fetch one page of comments for a post.

    Args:
        min_id: Pagination cursor. A value starting with ``"permalink"``
            requests the page with ``permalink_enabled=false`` and no
            cursor; any other string is URL-quoted and sent as ``min_id``.
        post_pk: Media identifier interpolated into the endpoint path.

    Returns:
        The decoded JSON response, or ``False`` when ``min_id`` is not a
        string, the request fails, the server answers with an error status,
        or the body is not valid JSON.
    """
    if not isinstance(min_id, str):
        # A non-string cursor (e.g. None) has always produced False.
        return False

    if min_id.startswith("permalink"):
        url = f"https://www.instagram.com/api/v1/media/{post_pk}/comments/?can_support_threading=true&permalink_enabled=false"
    else:
        url = f"https://www.instagram.com/api/v1/media/{post_pk}/comments/?can_support_threading=true&min_id={urllib.parse.quote(str(min_id).encode())}"

    # NOTE(review): the CSRF token and session cookie below are hard-coded
    # credentials. They should be loaded from the environment or a config
    # file that is kept out of version control.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0',
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'X-CSRFToken': '3rSOOhL1DSUQT2czImiCbMevgLgGx6kZ',
        'X-IG-App-ID': '936619743392459',
        'X-ASBD-ID': '198387',
        'X-IG-WWW-Claim': 'hmac.AR0OphPNsUYX-i9oXNzfh6JF3hDx_3eDAjxFFjWI0DYyKsO6',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive',
        'Referer': 'https://www.instagram.com/p/CqkUyB6ICbR/',
        'Cookie': 'ig_did=ACEC5413-54FC-4EFA-B0AF-C5F43D60BB8A; datr=usoZZDUz2YHk2UAtkauI2jWW; mid=ZBnKwwALAAEW_HOHO2zCuHUsHbG8; ig_nrcb=1; sessionid=58527153666%3AszpE7BNZUXL1Z9%3A7%3AAYelK_Wp6wCVw9U89yijo3VXXbDpn4zdD9Q-rOXy-Q; ds_user_id=58527153666; csrftoken=3rSOOhL1DSUQT2czImiCbMevgLgGx6kZ; dpr=1.25; shbid="16293\\05458527153666\\0541712933531:01f7f3123fe1f37549d037b1a1411e119e8d36295768b16f0d4189989400a3f6c4719399"; shbts="1681397531\\05458527153666\\0541712933531:01f78bd56e3232a7c8e15ca38b2d311e7d086901d832c9e272814290edb76152436a31dd"; rur="LDC\\05458527153666\\0541712933653:01f7eaf265c5347c6a98150aed096c93c7f28416075157f5491744f72be82f92a96b225b"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
    }

    try:
        # Without a timeout, requests waits indefinitely on a stalled
        # connection.
        response = requests.request("GET", url, headers=headers, timeout=30)
        response.raise_for_status()
        return response.json()
    except (requests.RequestException, ValueError):
        # Network/HTTP errors and undecodable bodies keep the historical
        # "return False" contract. KeyboardInterrupt and genuine bugs are no
        # longer swallowed.
        return False
|
||
|
||
|
||
# Scrape the comments of every top post for `tag_name` and dump them to JSON.
data = getTopPosts()
grids = data['data']['top']['sections']
comments = []
mediacounter = 0
# Consecutive failed page fetches tolerated per post before giving up, so a
# persistent error cannot keep the loop retrying the same cursor forever.
max_failures = 3

for grid in grids:
    medias = grid['layout_content']['medias']
    for media in medias:
        post_pk = media['media']['pk']
        # Sentinel cursor: getComments treats a value starting with
        # "permalink" as a request for the first page.
        next_cursor = 'permalink_enabled=false'
        failures = 0

        while True:
            page = getComments(min_id=next_cursor, post_pk=post_pk)
            # Throttle every request, whether it succeeded or not.
            time.sleep(5)

            if page is False:
                failures += 1
                if failures >= max_failures:
                    break
                continue
            failures = 0

            next_cursor = page.get('next_min_id')
            # An error payload may lack 'comments'; treat it as an empty page.
            comments += page.get('comments', [])

            # Stop when there is no further cursor to follow.
            if next_cursor is None or 'cached_comments_cursor' not in next_cursor:
                break

        mediacounter += 1
        print(mediacounter)
        print(f"comments counts: {len(comments)}")

        # NOTE(review): `comments` is never reset, so each per-post file holds
        # every comment gathered so far — confirm this cumulative dump is
        # intended.
        current = datetime.now()
        # The context manager closes the file even if serialisation fails.
        with open(
                f"{tag_name}-{current.strftime('%Y-%m-%d-T-%H-%M-%S')}-{media['media']['pk']}-scrapped.json",
                "w", encoding="utf-8") as f:
            f.write(json.dumps(comments))
|