98 lines
3.7 KiB
Python
98 lines
3.7 KiB
Python
import requests
|
|
import socket
|
|
import socks
|
|
import json
|
|
import pymongo
|
|
import time
|
|
import urllib.parse
|
|
from datetime import datetime
|
|
import signal
|
|
import sys
|
|
|
|
|
|
def signal_handling(signum, frame):
    """Signal handler: announce the shutdown, then terminate the interpreter.

    Both parameters are supplied by the ``signal`` machinery when the handler
    fires and are not used here.

    Raises:
        SystemExit: always, via ``sys.exit()`` (no explicit exit status).
    """
    farewell = "you chose to end the program"
    print(farewell)
    sys.exit()
|
|
|
|
|
|
# Install signal_handling as the SIGINT (Ctrl+C) handler so an interrupt prints
# a message and exits through sys.exit().
signal.signal(signal.SIGINT, signal_handling)

# Configure a process-wide default SOCKS5 proxy at 127.0.0.1:1080, then replace
# socket.socket with the proxy-aware class. Any code that creates sockets via
# socket.socket after this point gets a socks.socksocket instead (presumably
# this is how the HTTP requests below are routed through the proxy).
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, "127.0.0.1", 1080)
socket.socket = socks.socksocket
|
|
|
|
|
|
# NOTE(review): this default URI embeds a username and password in source.
# It is kept so existing zero-argument callers behave exactly as before, but
# the credentials should be moved to configuration and passed in explicitly.
_DEFAULT_CONNECTION_STRING = "mongodb://amir:1234@localhost:27017/insta"


def get_database(connection_string: str = _DEFAULT_CONNECTION_STRING):
    """Create and return a MongoDB client.

    Despite the function's name, the value returned is the
    ``pymongo.MongoClient`` itself, not a database handle; callers pick the
    database from it (e.g. ``client['insta']``).

    Args:
        connection_string: MongoDB URI to connect to. Defaults to the local
            ``insta`` URI that was previously hard-coded in this function.

    Returns:
        A ``pymongo.MongoClient`` constructed from ``connection_string``.
    """
    return pymongo.MongoClient(connection_string)
|
|
|
|
|
|
def _comments_url(min_id: str, post_pk) -> str:
    """Build the comments endpoint URL for one page of a post's comments."""
    base = (
        f"https://www.instagram.com/api/v1/media/{post_pk}/comments/"
        "?can_support_threading=true"
    )
    # A cursor beginning with "permalink" is the caller's marker for "first
    # page": no min_id is sent, only the permalink_enabled flag.
    if min_id.startswith("permalink"):
        return f"{base}&permalink_enabled=false"
    return f"{base}&min_id={urllib.parse.quote(min_id)}"


def get_comments(min_id: str, post_pk, timeout: float = 30.0):
    """Fetch one page of comments for a post from the Instagram web API.

    Args:
        min_id: Pagination cursor. Any value starting with ``"permalink"``
            requests the first page; otherwise the value is URL-quoted and
            sent as the ``min_id`` query parameter.
        post_pk: Identifier of the post, interpolated into the URL path.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scraper forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.RequestException: on connection problems or timeout.
        ValueError: (or a subclass) if the body is not valid JSON.
    """
    # NOTE(review): the CSRF token, claim header and session cookie below are
    # hard-coded account secrets tied to one browser session (and the Referer
    # points at one specific post). They should come from configuration.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0',
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'X-CSRFToken': 'SyRjkhE11IZJJzEjFYNFOCAkbHObhpp6',
        'X-IG-App-ID': '936619743392459',
        'X-ASBD-ID': '198387',
        'X-IG-WWW-Claim': 'hmac.AR3dSNyXs6hL8Kd5CQUR--JX03KHuY0enjoxwEaUbJ60jFfO',
        'X-Requested-With': 'XMLHttpRequest',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Referer': 'https://www.instagram.com/p/Cp7R84moFix/',
        'Cookie': 'dpr=1.3636363636363635; ig_did=C7B7A9D0-EAFA-4B62-8AFF-8217A91EA3B8; datr=xKsVZPIxlYi2YshNs71hgDiE; csrftoken=SyRjkhE11IZJJzEjFYNFOCAkbHObhpp6; mid=ZBWrxgALAAEC4bnKm9IP1TZAOBVU; ig_nrcb=1; rur="ASH\\05458490635989\\0541710690982:01f7d99273b41cbcfaffa5a5edd22f59b954c960e0e8a5576dd95f63fe741570bc97a123"; ds_user_id=58490635989; sessionid=58490635989%3Asm9z3FQSns1adH%3A5%3AAYcZBsVcUAY3bASwyC9goX-5mMD89Zxnpbjt29E0ug',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'TE': 'trailers',
    }

    response = requests.get(
        _comments_url(min_id, post_pk), headers=headers, timeout=timeout
    )
    # Fail loudly on an error status instead of handing an error payload
    # to the caller as if it were a page of comments.
    response.raise_for_status()
    return response.json()
|
|
|
|
# client = get_database()
|
|
# db = client['insta']
|
|
# cls = db.list_collection_names()
|
|
# return nextcursor
|
|
|
|
|
|
target_account_name = "bartar.academy"
target_classification = "seo-web"


def _fetch_all_comments(post_pk):
    """Page through get_comments for one post and return every comment found."""
    # Any cursor starting with "permalink" makes get_comments ask for page one.
    cursor = 'permalink_enabled=false'
    comments = []
    while True:
        page = get_comments(cursor, post_pk)
        comments += page['comments']
        cursor = page.get('next_min_id')
        # Stop when no cursor is returned, or when the returned cursor does not
        # mention 'cached_comments_cursor'.
        if cursor is None or 'cached_comments_cursor' not in cursor:
            return comments
        # Pause between consecutive page requests.
        time.sleep(3)


def _save_comments(index, post_pk, comments):
    """Write one post's comments to a timestamped JSON file in the CWD."""
    stamp = datetime.now().strftime('%Y-%m-%d-T-%H-%M-%S')
    filename = (
        f"{index}-{target_classification}-{target_account_name}-"
        f"{stamp}-{post_pk}-scrapped.json"
    )
    with open(filename, "w", encoding="utf-8") as out:
        out.write(json.dumps(comments))


def main():
    """Scrape the comments of every post listed in ``<target_account_name>.json``.

    The input file must hold a JSON object whose ``items`` list contains posts
    with a ``pk`` key. One output file is written per post, and the number of
    comments collected for it is printed.
    """
    with open(f"{target_account_name}.json", 'r', encoding="utf8") as posts_file:
        posts = json.load(posts_file)

    # The 1-based index prefixes each output file name.
    for index, post in enumerate(posts['items'], start=1):
        comments = _fetch_all_comments(post['pk'])
        _save_comments(index, post['pk'], comments)
        print(len(comments))


if __name__ == "__main__":
    try:
        main()
    except Exception as exc:
        # The run still stops at the first failure, as before, but the cause is
        # now reported instead of being silently discarded. SystemExit from the
        # Ctrl+C handler is not an Exception and passes through untouched.
        print(f"Scraping aborted: {exc!r}", file=sys.stderr)
        sys.exit(1)
|