# Reconstructed from git patch 0501f46803ba9740668fb26589bed894beb22a27
# ("feat: main functions", Ting-Jun Wang, Thu, 8 Dec 2022 01:21:52 +0800).
# The same patch also adds a .gitignore whose only entry is `images/*`.
"""Download the images of an Instagram user's timeline.

Images are stored under ``images/<username>/``. A later run stops
listing the feed at the first post whose file is already on disk, so only
posts that have not been downloaded yet are fetched.

Usage: python main.py [username]
"""

import json
import os
import sys
import time

import requests
from tqdm import tqdm

header = {
    # 'x-ig-app-id': 'You should find IG App ID on the browser',
    'x-ig-app-id': '936619743392459',
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
}

# Seconds to wait for a single HTTP response. Without a timeout, requests
# blocks forever on a stalled connection.
REQUEST_TIMEOUT = 30


def _fetch_json(url, params=None):
    """GET *url* with the Instagram headers and return the decoded JSON body.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within REQUEST_TIMEOUT.
    """
    response = requests.get(
        url, headers=header, params=params, timeout=REQUEST_TIMEOUT
    )
    # Fail here with a clear HTTP error instead of a confusing KeyError /
    # JSON error further down when the API rejects the request.
    response.raise_for_status()
    return json.loads(response.text)


def _item_links(item, username):
    """Return the ``(url, filename)`` pairs for one feed item.

    Returns ``None`` when the item's file (``<code>`` for a single image,
    ``<code>_0`` for a carousel) already exists under ``images/<username>/``.
    The URL taken is the first entry of ``image_versions2.candidates``.
    """
    code = item['code']

    if 'carousel_media' in item:
        if os.path.isfile("images/{}/{}_0".format(username, code)):
            return None
        return [
            (
                image['image_versions2']['candidates'][0]['url'],
                "{}_{}".format(code, index),
            )
            for index, image in enumerate(item['carousel_media'])
        ]

    if os.path.isfile("images/{}/{}".format(username, code)):
        return None
    return [(item['image_versions2']['candidates'][0]['url'], code)]


def getLinks(username: str):
    """Collect the image links of *username* that are not on disk yet.

    The feed is read page by page (15 items per request) and paging stops
    either when the API reports no more items or when an item whose file
    already exists is reached.

    Args:
        username: Instagram account name.

    Returns:
        A list of ``(url, filename)`` tuples in feed order.

    Raises:
        requests.HTTPError: if any API request returns an error status.
        requests.Timeout: if any API request exceeds REQUEST_TIMEOUT.
    """
    print("Get images' links...'")

    # The profile endpoint is only used for the total media count, which
    # sizes the progress bar.
    profile = _fetch_json(
        "https://www.instagram.com/api/v1/users/web_profile_info/",
        params={'username': username},
    )
    media_count = profile['data']['user']['edge_owner_to_timeline_media']['count']

    feed_url = "https://www.instagram.com/api/v1/feed/user/{}/username/".format(username)
    # The first request has no "max_id"; every later request passes the
    # "next_max_id" of the previous page to get the next group of media.
    params = {'count': 15}

    links = []
    already_downloaded = False

    # The context manager closes the bar so it does not garble the download
    # progress bar that is shown afterwards.
    with tqdm(total=media_count) as progress:
        while True:
            data = _fetch_json(feed_url, params=params)

            for item in data['items']:
                item_links = _item_links(item, username)
                if item_links is None:
                    # Reached a post that is already on disk: stop here and
                    # return only the links collected so far.
                    already_downloaded = True
                    break
                links.extend(item_links)

            progress.update(data['num_results'])

            if already_downloaded or not data['more_available']:
                break

            params['max_id'] = data['next_max_id']
            time.sleep(0.2)  # small pause between page requests

    if already_downloaded:
        print("Only need to get {} images".format(len(links)))

    return links


def checkDir(username: str):
    """Create ``images/<username>/`` (and ``images/``) if they do not exist."""
    os.makedirs(os.path.join('images', username), exist_ok=True)


def downloadImages(username, links: list):
    """Download every ``(url, filename)`` in *links* to ``images/<username>/``.

    Args:
        username: Name of the sub-directory of ``images/`` to write into;
            the directory must already exist (see checkDir).
        links: ``(url, filename)`` tuples as returned by getLinks.

    Raises:
        requests.HTTPError: if a download returns an error status; nothing
            is written for that image.
        requests.Timeout: if a download exceeds REQUEST_TIMEOUT.
    """
    print("download images...")
    with tqdm(total=len(links)) as progress:
        # getLinks stops at the first file that already exists and treats
        # everything after it in the list as downloaded. Working from the
        # end of the list backwards keeps that assumption true when a run
        # is interrupted: the entries still missing are always the ones at
        # the front, so the next run picks them up again.
        for url, filename in reversed(links):
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
            # Never store an HTTP error body as an "image": the file would
            # later be mistaken for a finished download.
            response.raise_for_status()
            with open('images/{}/{}'.format(username, filename), 'wb') as fp:
                fp.write(response.content)
            progress.update(1)
            time.sleep(0.5)  # small pause between downloads


def main():
    """Command-line entry point: ``python main.py [username]``."""
    if len(sys.argv) != 2:
        print("Usage: python main.py [username]")
        return

    username = sys.argv[1]
    checkDir(username)
    downloadImages(username, getLinks(username))


if __name__ == "__main__":
    main()