instagram-scrap-2/crawler.py

103 lines
4.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import json
import time
import requests
import socket
import socks
import urllib.parse
from datetime import datetime
# Route ALL TCP connections made by this process through a local SOCKS5
# proxy on 127.0.0.1:1080 by monkey-patching the global socket factory —
# every `requests` call below inherits this. NOTE(review): this assumes a
# proxy client (e.g. an SSH tunnel) is already listening on that port.
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, '127.0.0.1', 1080)
socket.socket = socks.socksocket
# Hashtag to crawl (Persian, "programming"); also embedded in the output filename.
tag_name = 'برنامه_نویسی'
def getTopPosts(tag=None):
    """Fetch Instagram's web_info payload (top posts grid) for a hashtag.

    Parameters
    ----------
    tag : str | None
        Hashtag to query. Defaults to the module-level ``tag_name``, so
        existing ``getTopPosts()`` callers are unaffected.

    Returns
    -------
    dict
        The decoded JSON response body.

    NOTE(review): the CSRF token and session Cookie below are hard-coded
    credentials and will expire/leak if this file is shared — they should
    be moved out of source into configuration.
    """
    if tag is None:
        tag = tag_name  # backward-compatible default: the module-level hashtag
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0',
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'X-CSRFToken': '3rSOOhL1DSUQT2czImiCbMevgLgGx6kZ',
        'X-IG-App-ID': '936619743392459',
        'X-ASBD-ID': '198387',
        'X-IG-WWW-Claim': 'hmac.AR0OphPNsUYX-i9oXNzfh6JF3hDx_3eDAjxFFjWI0DYyKsO6',
        'X-Requested-With': 'XMLHttpRequest',
        'Connection': 'keep-alive',
        'Referer': 'https://www.instagram.com/explore/tags/%D8%A8%D8%B1%D9%86%D8%A7%D9%85%D9%87_%D9%86%D9%88%DB%8C%D8%B3%DB%8C/',
        'Cookie': 'ig_did=ACEC5413-54FC-4EFA-B0AF-C5F43D60BB8A; datr=usoZZDUz2YHk2UAtkauI2jWW; mid=ZBnKwwALAAEW_HOHO2zCuHUsHbG8; ig_nrcb=1; sessionid=58527153666%3AszpE7BNZUXL1Z9%3A7%3AAYelK_Wp6wCVw9U89yijo3VXXbDpn4zdD9Q-rOXy-Q; ds_user_id=58527153666; csrftoken=3rSOOhL1DSUQT2czImiCbMevgLgGx6kZ; dpr=1.25; shbid="16293\\05458527153666\\0541712933531:01f7f3123fe1f37549d037b1a1411e119e8d36295768b16f0d4189989400a3f6c4719399"; shbts="1681397531\\05458527153666\\0541712933531:01f78bd56e3232a7c8e15ca38b2d311e7d086901d832c9e272814290edb76152436a31dd"; rur="LDC\\05458527153666\\0541712933636:01f75496bea7dd870ec16fa6f07ff96b9d518c7d09d2d1b3c6b84cdcbd06689e2ae5bc1e"',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
    }
    params = {
        'tag_name': tag,
    }
    # Explicit timeout so a stalled SOCKS proxy can't hang the crawler forever.
    response = requests.get(
        'https://www.instagram.com/api/v1/tags/web_info/',
        params=params, headers=headers, timeout=30)
    return response.json()
def getComments(min_id: str, post_pk):
    """Fetch one page of comments for an Instagram post.

    Parameters
    ----------
    min_id : str
        Pagination cursor from the previous page's ``next_min_id``, or a
        string starting with ``"permalink"`` to request the first page.
    post_pk
        The media's numeric primary key (``media['media']['pk']``).

    Returns
    -------
    dict | bool
        The decoded JSON payload on success, ``False`` on any failure
        (network error, timeout, non-JSON body) so the caller's loop can
        continue without crashing.

    NOTE(review): the CSRF token and session Cookie are hard-coded
    credentials — same concern as in ``getTopPosts``.
    """
    try:
        # Build the URL once, in the right branch — the original computed the
        # cursor URL (including a quote() call) and then discarded it for the
        # first-page case.
        if min_id.startswith("permalink"):
            url = f"https://www.instagram.com/api/v1/media/{post_pk}/comments/?can_support_threading=true&permalink_enabled=false"
        else:
            url = f"https://www.instagram.com/api/v1/media/{post_pk}/comments/?can_support_threading=true&min_id={urllib.parse.quote(str(min_id).encode())}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0',
            'Accept': '*/*',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'X-CSRFToken': '3rSOOhL1DSUQT2czImiCbMevgLgGx6kZ',
            'X-IG-App-ID': '936619743392459',
            'X-ASBD-ID': '198387',
            'X-IG-WWW-Claim': 'hmac.AR0OphPNsUYX-i9oXNzfh6JF3hDx_3eDAjxFFjWI0DYyKsO6',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
            'Referer': 'https://www.instagram.com/p/CqkUyB6ICbR/',
            'Cookie': 'ig_did=ACEC5413-54FC-4EFA-B0AF-C5F43D60BB8A; datr=usoZZDUz2YHk2UAtkauI2jWW; mid=ZBnKwwALAAEW_HOHO2zCuHUsHbG8; ig_nrcb=1; sessionid=58527153666%3AszpE7BNZUXL1Z9%3A7%3AAYelK_Wp6wCVw9U89yijo3VXXbDpn4zdD9Q-rOXy-Q; ds_user_id=58527153666; csrftoken=3rSOOhL1DSUQT2czImiCbMevgLgGx6kZ; dpr=1.25; shbid="16293\\05458527153666\\0541712933531:01f7f3123fe1f37549d037b1a1411e119e8d36295768b16f0d4189989400a3f6c4719399"; shbts="1681397531\\05458527153666\\0541712933531:01f78bd56e3232a7c8e15ca38b2d311e7d086901d832c9e272814290edb76152436a31dd"; rur="LDC\\05458527153666\\0541712933653:01f7eaf265c5347c6a98150aed096c93c7f28416075157f5491744f72be82f92a96b225b"',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
        }
        # requests.get with an explicit timeout, instead of the generic
        # requests.request("GET", ...) with none.
        response = requests.get(url, headers=headers, timeout=30)
        return response.json()
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt and
        # SystemExit — `Exception` keeps the best-effort contract (return
        # False) while letting Ctrl-C stop the crawler.
        return False
# ---- Driver: fetch the tag's top-post grid, page through every post's
# ---- comments, and dump everything to a single timestamped JSON file.
data = getTopPosts()
grids = data['data']['top']['sections']
comments = []
mediacounter = 0  # count of posts whose comment pages are exhausted
for grid in grids:
    for media in grid['layout_content']['medias']:
        # Sentinel cursor telling getComments() to request the first page.
        next_cursor = 'permalink_enabled=false'
        while True:
            page = getComments(min_id=next_cursor,
                               post_pk=media['media']['pk'])
            if page is not False:
                # .get() guards against a payload missing either key
                # (the original raised KeyError on a partial response).
                next_cursor = page.get('next_min_id')
                comments += page.get('comments', [])
            time.sleep(5)  # throttle: one request per 5 s
            # A continuation cursor contains 'cached_comments_cursor';
            # anything else (or no cursor) means this post is done.
            if next_cursor is None or 'cached_comments_cursor' not in next_cursor:
                mediacounter += 1
                print(mediacounter)
                break
print(f"comments counts: {len(comments)}")
current = datetime.now()
# NOTE(review): the filename embeds the pk of the *last* media processed
# (the loop variable leaks out of the for-loop) — kept as-is for
# compatibility with existing output names.
outname = (f"{tag_name}-{current.strftime('%Y-%m-%d-T-%H-%M-%S')}"
           f"-{media['media']['pk']}-scrapped.json")
# Context manager + explicit encoding instead of open()/close();
# ensure_ascii=False keeps the Persian comment text human-readable.
with open(outname, "w", encoding="utf-8") as f:
    f.write(json.dumps(comments, ensure_ascii=False))