From 0501f46803ba9740668fb26589bed894beb22a27 Mon Sep 17 00:00:00 2001
From: Ting-Jun Wang <levi900227@gmail.com>
Date: Thu, 8 Dec 2022 01:21:52 +0800
Subject: [PATCH] feat: main functions

---
 .gitignore |  1 +
 main.py    | 95 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 96 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 main.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2c31100
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+images/*
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..e7006fe
--- /dev/null
+++ b/main.py
@@ -0,0 +1,95 @@
+import requests
+import os
+import time
+import json
+from tqdm import tqdm
+import sys
+
+header = {
+        # 'x-ig-app-id': 'You should find IG App ID on the browser',
+        'x-ig-app-id': '936619743392459',
+        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
+    }
+
+def getLinks(username: str):
+    print("Get images' links...'")
+
+    # find this user's total media counts
+    # progress bar
+    response = requests.get("https://www.instagram.com/api/v1/users/web_profile_info/?username="+username, headers=header)
+    progress = tqdm(total=json.loads(response.text)['data']['user']['edge_owner_to_timeline_media']['count'])               # media counts
+
+    url = "https://www.instagram.com/api/v1/feed/user/{}/username/?count=15".format(username)
+    links = []
+
+    # get pictures' links
+    load_count = 0
+    flag = True
+    while flag:
+        # We should use "next_max_id" to requests the next group of medias
+        # But we will not have "next_max_id" in the first requst
+        if load_count == 0:
+            response = requests.get(url, headers=header)
+            url += "&max_id={}"
+        else:
+            response = requests.get(url.format(data['next_max_id']), headers=header)
+
+        # get links from responded JSON
+        data = json.loads(response.text)
+        for item in data['items']:
+            if 'carousel_media' in item:
+                code = item['code']
+                
+                # if the picture exist, break this loop and return links that have images haven't been downloaded
+                if os.path.isfile("images/{}/{}_0".format(username, code)):
+                    flag = False
+                    break
+                for index, image in enumerate(item['carousel_media']):
+                    link = image['image_versions2']['candidates'][0]['url']
+                    links.append((link, "{}_{}".format(code, index)))
+            else:
+                link = item['image_versions2']['candidates'][0]['url']
+                code = item['code']
+                if os.path.isfile("images/{}/{}".format(username, code)):
+                    flag = False
+                    break
+                links.append((link, code))
+
+        # update progress bar
+        progress.update(data['num_results'])
+
+        # check whether need to break
+        if data['more_available'] == False:
+            break
+
+        load_count += 1
+        time.sleep(0.2)
+
+    if not flag:
+        print("Only need to get {} images".format(len(links)))
+
+    return links
+
+def checkDir(username: str):
+    if not os.path.isdir('images'):
+        os.mkdir('images')
+    if not os.path.isdir('images/'+username):
+        os.mkdir('images/'+username)
+
+def downloadImages(username, links: list):
+    print("download images...")
+    progress = tqdm(total=len(links))
+    for link in links:
+        response = requests.get(link[0])
+        with open('images/{}/{}'.format(username, link[1]), 'wb') as fp:
+            fp.write(response.content)
+        progress.update(1)
+        time.sleep(0.5)
+
+if __name__ == "__main__":
+    if len(sys.argv) != 2:
+        print("Usage: python main.py [username]")
+    else:
+        username = sys.argv[1]
+        checkDir(username)
+        downloadImages( username, getLinks(username) )