instagram-scrap-2/app.py

98 lines
3.7 KiB
Python

import requests
import socket
import socks
import json
import pymongo
import time
import urllib.parse
from datetime import datetime
import signal
import sys
def signal_handling(signum, frame):
    """Signal handler: announce the shutdown and terminate the interpreter.

    Both arguments are supplied by the ``signal`` machinery and are unused.
    Termination happens by raising ``SystemExit`` with no exit code.
    """
    farewell = "you chose to end the program"
    print(farewell)
    sys.exit(None)
# Install signal_handling as the SIGINT (Ctrl-C) handler for this process.
signal.signal(signal.SIGINT, signal_handling)
# Configure a default SOCKS5 proxy at 127.0.0.1:1080 and replace the standard
# socket class with the proxy-aware one, so sockets created through
# `socket.socket` after this point use the proxy.
# NOTE(review): presumably this is what routes the `requests` traffic below
# through the proxy — confirm a SOCKS5 server is listening on that port.
socks.setdefaultproxy(socks.PROXY_TYPE_SOCKS5, '127.0.0.1', 1080)
socket.socket = socks.socksocket
def get_database(connection_string=None):
    """Create a MongoDB client.

    Despite its name, this returns the client itself, not a database handle;
    callers pick the database from it (e.g. ``client['insta']``).

    Args:
        connection_string: MongoDB URI to connect with. When omitted, the
            ``MONGODB_URI`` environment variable is used if it is set and
            non-empty; otherwise the original local URI is used, so existing
            zero-argument callers behave as before.

    Returns:
        A ``pymongo.MongoClient`` built from the resolved URI.
    """
    # Local import: only this function needs it, and the file-level import
    # block is left untouched.
    import os

    if connection_string is None:
        # NOTE: the fallback URI embeds credentials; prefer supplying the URI
        # through the argument or the environment outside local development.
        connection_string = (
            os.environ.get("MONGODB_URI")
            or "mongodb://amir:1234@localhost:27017/insta"
        )
    return pymongo.MongoClient(connection_string)
def get_comments(min_id: str, post_pk):
    """Fetch one page of comments for a post from Instagram's web API.

    Args:
        min_id: Pagination cursor. A value starting with ``"permalink"``
            requests the page without a cursor (the caller's starting value);
            any other value is URL-quoted and sent as the ``min_id`` query
            parameter.
        post_pk: Media primary key interpolated into the endpoint path.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failure or timeout.
        ValueError: If the response body is not valid JSON.
    """
    base_url = f"https://www.instagram.com/api/v1/media/{post_pk}/comments/?can_support_threading=true"
    if min_id.startswith("permalink"):
        url = f"{base_url}&permalink_enabled=false"
    else:
        url = f"{base_url}&min_id={urllib.parse.quote(str(min_id))}"

    # NOTE: the CSRF token, claim and session cookie below are credentials
    # captured from a browser session and hard-coded here.
    # NOTE(review): 'br' in Accept-Encoding can only be decoded when a brotli
    # package is installed alongside requests/urllib3 — verify.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0',
        'Accept': '*/*',
        'Accept-Language': 'en-US,en;q=0.5',
        'Accept-Encoding': 'gzip, deflate, br',
        'X-CSRFToken': 'SyRjkhE11IZJJzEjFYNFOCAkbHObhpp6',
        'X-IG-App-ID': '936619743392459',
        'X-ASBD-ID': '198387',
        'X-IG-WWW-Claim': 'hmac.AR3dSNyXs6hL8Kd5CQUR--JX03KHuY0enjoxwEaUbJ60jFfO',
        'X-Requested-With': 'XMLHttpRequest',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Referer': 'https://www.instagram.com/p/Cp7R84moFix/',
        'Cookie': 'dpr=1.3636363636363635; ig_did=C7B7A9D0-EAFA-4B62-8AFF-8217A91EA3B8; datr=xKsVZPIxlYi2YshNs71hgDiE; csrftoken=SyRjkhE11IZJJzEjFYNFOCAkbHObhpp6; mid=ZBWrxgALAAEC4bnKm9IP1TZAOBVU; ig_nrcb=1; rur="ASH\\05458490635989\\0541710690982:01f7d99273b41cbcfaffa5a5edd22f59b954c960e0e8a5576dd95f63fe741570bc97a123"; ds_user_id=58490635989; sessionid=58490635989%3Asm9z3FQSns1adH%3A5%3AAYcZBsVcUAY3bASwyC9goX-5mMD89Zxnpbjt29E0ug',
        'Sec-Fetch-Dest': 'empty',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'TE': 'trailers',
    }

    # A timeout keeps a stalled connection from blocking the scraper forever.
    response = requests.get(url, headers=headers, timeout=30)
    # Surface HTTP failures here, with the status code, instead of letting the
    # caller trip over a payload that lacks the expected keys.
    response.raise_for_status()
    return response.json()
# client = get_database()
# db = client['insta']
# cls = db.list_collection_names()
# return nextcursor
# Name of the account being scraped: the posts are read from
# "<target_account_name>.json" and the name is embedded in every output
# filename.
target_account_name = "bartar.academy"
# Label embedded in every output filename.
target_classification = "seo-web"
def _fetch_all_comments(post_pk):
    """Collect the comments of one post by following the pagination cursor.

    Starts from the 'permalink_enabled=false' cursor and keeps requesting
    pages until the response carries no ``next_min_id`` or the cursor no
    longer contains 'cached_comments_cursor'.

    Raises:
        KeyError: If a response has no 'comments' key.
    """
    cursor = 'permalink_enabled=false'
    collected = []
    while True:
        page = get_comments(cursor, post_pk)
        cursor = page.get('next_min_id')
        collected.extend(page['comments'])
        if cursor is None or 'cached_comments_cursor' not in cursor:
            return collected
        # Pause between consecutive page requests.
        time.sleep(3)


def main():
    """Scrape the comments of every post listed in the account's JSON file.

    Reads "<target_account_name>.json", and for each entry of its 'items'
    list writes the collected comments to a timestamped
    "...-scrapped.json" file and prints how many comments were saved.
    """
    with open(f"{target_account_name}.json", 'r', encoding="utf8") as posts_file:
        posts = json.load(posts_file)

    for index, post in enumerate(posts['items'], start=1):
        comments = _fetch_all_comments(post['pk'])
        stamp = datetime.now().strftime('%Y-%m-%d-T-%H-%M-%S')
        out_name = f"{index}-{target_classification}-{target_account_name}-{stamp}-{post['pk']}-scrapped.json"
        # Serialise first so a serialisation error cannot leave a
        # half-written file behind.
        serialised = json.dumps(comments)
        with open(out_name, "w", encoding="utf8") as out_file:
            out_file.write(serialised)
        print(len(comments))


if __name__ == "__main__":
    try:
        main()
    except Exception as exc:
        # Top-level boundary: report the failure instead of hiding it.
        # `Exception` deliberately excludes SystemExit, so the SIGINT handler's
        # sys.exit() still terminates the program normally.
        print(f"scraping aborted: {exc!r}", file=sys.stderr)
        sys.exit(1)