diff --git a/CHANGELOG b/CHANGELOG index aab3e83c..23f53d0c 100644 --- a/CHANGELOG +++ b/CHANGELOG @@ -1,5 +1,38 @@ Instagram PHP Scraper Change Log ================================ +Version v0.4.5 +-------------- +- Enh: BIG thing. Get best resolutions for medias possible + +Version v0.4.4 +-------------- +- Bug: Image urls broken for some medias in getMediaByCode() + +Version v0.4.3 +-------------- +- Enh: Make methods getContentsFromUrl() and generateRandomString() static + +Version v0.4.2 +-------------- +- Bug: Fix 405 error with fetching accounts by id +- Bug: Fix 405 error with fetching comments by id + +Version v0.4.1 +-------------- +- Enh: Url encode endpoints to support for example Japan language + +Version v0.4.0 +-------------- +- Enh: New methods to paginate medias getPaginateMedias() and getPaginateMediasByTag() + +Version v0.3.5 +-------------- +- Enh: Convenient media pagination getMediasByTag() + +Version v0.3.4 +-------------- +- Bug: Fix media urls + Version v0.3.3 -------------- - Bug: Include class Location and Comment in InstagramScraper.php diff --git a/README.md b/README.md index 89dc902b..712aec70 100644 --- a/README.md +++ b/README.md @@ -35,9 +35,9 @@ echo $account->username; ### Search users by username ```php -$medias = Instagram::searchAccountsByUsername('durov'); +$users = Instagram::searchAccountsByUsername('durov'); echo '
';
-echo json_encode($medias);
+echo json_encode($users);
 echo '

'; ``` @@ -56,9 +56,20 @@ Available properties: $imageStandardResolutionUrl; $imageHighResolutionUrl; $caption; + $captionIsEdited; + $isAd; $videoLowResolutionUrl; $videoStandardResolutionUrl; $videoLowBandwidthUrl; + $videoViews; + $code; + $owner; + $ownerId; + $likesCount; + $locationId; + $locationName; + $commentsCount; + */ echo $medias[0]->imageHighResolutionUrl; echo $medias[0]->caption; @@ -163,4 +174,7 @@ $medias = Instagram::getLocationTopMediasById(1); ### Get location medias by location id ```php $medias = Instagram::getLocationMediasById(1); -``` \ No newline at end of file +``` + +### Other +Java library: https://github.com/postaddictme/instagram-java-scraper \ No newline at end of file diff --git a/index.php b/index.php index 2a6424f9..2f342aaf 100644 --- a/index.php +++ b/index.php @@ -3,13 +3,15 @@ require_once 'vendor/autoload.php'; require_once 'src/InstagramScraper.php'; -use InstagramScraper\Exception\InstagramException; + use InstagramScraper\Instagram; -$instagram = new Instagram(); try { - $medias = Instagram::getMedias('kevin', 1000); - echo $medias[998]->imageThumbnailUrl; +// $medias = Instagram::getMedias('kevin', 1497); +// echo json_encode($medias[1497]); + $medias = InstagramScraper\Instagram::getMediasByTag('paveldurov', 300); + echo sizeof($medias) . '\n'; +// echo json_encode($medias); } catch (\Exception $ex) { print_r($ex); } diff --git a/src/InstagramScraper/Endpoints.php b/src/InstagramScraper/Endpoints.php index a6e90900..1347f360 100644 --- a/src/InstagramScraper/Endpoints.php +++ b/src/InstagramScraper/Endpoints.php @@ -13,11 +13,13 @@ class Endpoints const MEDIA_JSON_BY_LOCATION_ID = 'https://www.instagram.com/explore/locations/{{facebookLocationId}}/?__a=1&max_id={{maxId}}'; const MEDIA_JSON_BY_TAG = 'https://www.instagram.com/explore/tags/{tag}/?__a=1&max_id={max_id}'; const GENERAL_SEARCH = 'https://www.instagram.com/web/search/topsearch/?query={query}'; - const ACCOUNT_JSON_INFO_BY_ID = 'https://www.instagram.com/query/?q=ig_user({userId}){id,username,external_url,full_name,profile_pic_url,biography,followed_by{count},follows{count},media{count},is_private,is_verified}'; - const LAST_COMMENTS_BY_CODE = 'https://www.instagram.com/query/?q=ig_shortcode({{code}}){comments.last({{count}}){count,nodes{id,created_at,text,user{id,profile_pic_url,username,follows{count},followed_by{count},biography,full_name,media{count},is_private,external_url,is_verified}},page_info}}'; - const COMMENTS_BEFORE_COMMENT_ID_BY_CODE = 'https://www.instagram.com/query/?q=ig_shortcode({{code}}){comments.before({{commentId}},{{count}}){count,nodes{id,created_at,text,user{id,profile_pic_url,username,follows{count},followed_by{count},biography,full_name,media{count},is_private,external_url,is_verified}},page_info}}'; - const LAST_LIKES_BY_CODE = 'https://www.instagram.com/query/?q=ig_shortcode({{code}}){likes{nodes{id,user{id,profile_pic_url,username,follows{count},followed_by{count},biography,full_name,media{count},is_private,external_url,is_verified}},page_info}}'; + const ACCOUNT_JSON_INFO_BY_ID = 'ig_user({userId}){id,username,external_url,full_name,profile_pic_url,biography,followed_by{count},follows{count},media{count},is_private,is_verified}'; + const LAST_COMMENTS_BY_CODE = 'ig_shortcode({{code}}){comments.last({{count}}){count,nodes{id,created_at,text,user{id,profile_pic_url,username,follows{count},followed_by{count},biography,full_name,media{count},is_private,external_url,is_verified}},page_info}}'; + const COMMENTS_BEFORE_COMMENT_ID_BY_CODE = 'ig_shortcode({{code}}){comments.before({{commentId}},{{count}}){count,nodes{id,created_at,text,user{id,profile_pic_url,username,follows{count},followed_by{count},biography,full_name,media{count},is_private,external_url,is_verified}},page_info}}'; + const LAST_LIKES_BY_CODE = 'ig_shortcode({{code}}){likes{nodes{id,user{id,profile_pic_url,username,follows{count},followed_by{count},biography,full_name,media{count},is_private,external_url,is_verified}},page_info}}'; + const INSTAGRAM_QUERY_URL = 'https://www.instagram.com/query/'; + const INSTAGRAM_CDN_URL = 'https://scontent.cdninstagram.com/'; public static function getAccountPageLink($username) { diff --git a/src/InstagramScraper/Instagram.php b/src/InstagramScraper/Instagram.php index 15b5894e..d83df454 100644 --- a/src/InstagramScraper/Instagram.php +++ b/src/InstagramScraper/Instagram.php @@ -34,18 +34,11 @@ public static function getAccount($username) public static function getAccountById($id) { - if (!is_numeric($id)) { throw new \InvalidArgumentException('User id must be integer or integer wrapped in string'); } - $response = Request::get(Endpoints::getAccountJsonInfoLinkByAccountId($id)); - if ($response->code === 404) { - throw new InstagramNotFoundException('Account with given username does not exist.'); - } - if ($response->code !== 200) { - throw new InstagramException('Response code is ' . $response->code . '. Body: ' . $response->body . ' Something went wrong. Please report issue.'); - } - $userArray = json_decode($response->raw_body, true); + $parameters = Endpoints::getAccountJsonInfoLinkByAccountId($id); + $userArray = json_decode(self::getContentsFromUrl($parameters), true); if ($userArray['status'] === 'fail') { throw new InstagramException($userArray['message']); } @@ -55,6 +48,38 @@ public static function getAccountById($id) return Account::fromAccountPage($userArray); } + private static function getContentsFromUrl($parameters) + { + if (!function_exists('curl_init')) { + return false; + } + $random = self::generateRandomString(); + $ch = curl_init(); + curl_setopt($ch, CURLOPT_URL, Endpoints::INSTAGRAM_QUERY_URL); + curl_setopt($ch, CURLOPT_RETURNTRANSFER, TRUE); + curl_setopt($ch, CURLOPT_POST, 1); + curl_setopt($ch, CURLOPT_POSTFIELDS, 'q=' . $parameters); + $headers = array(); + $headers[] = "Cookie: csrftoken=$random;"; + $headers[] = "X-Csrftoken: $random"; + $headers[] = "Referer: https://www.instagram.com/"; + curl_setopt($ch, CURLOPT_HTTPHEADER, $headers); + $output = curl_exec($ch); + curl_close($ch); + return $output; + } + + private static function generateRandomString($length = 10) + { + $characters = '0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'; + $charactersLength = strlen($characters); + $randomString = ''; + for ($i = 0; $i < $length; $i++) { + $randomString .= $characters[rand(0, $charactersLength - 1)]; + } + return $randomString; + } + public static function getMedias($username, $count = 20, $maxId = '') { $index = 0; @@ -95,8 +120,8 @@ public static function getPaginateMedias($username, $maxId = '') $medias = []; $toReturn = [ - 'medias' => $medias, - 'maxId' => $maxId, + 'medias' => $medias, + 'maxId' => $maxId, 'hasNextPage' => $hasNextPage ]; @@ -124,8 +149,8 @@ public static function getPaginateMedias($username, $maxId = '') $hasNextPage = $arr['more_available']; $toReturn = [ - 'medias' => $medias, - 'maxId' => $maxId, + 'medias' => $medias, + 'maxId' => $maxId, 'hasNextPage' => $hasNextPage ]; @@ -160,6 +185,7 @@ public static function getMediasByTag($tag, $count = 12, $maxId = '') { $index = 0; $medias = []; + $mediaIds = []; $hasNextPage = true; while ($index < $count && $hasNextPage) { $response = Request::get(Endpoints::getMediasJsonByTagLink($tag, $maxId)); @@ -179,7 +205,12 @@ public static function getMediasByTag($tag, $count = 12, $maxId = '') if ($index === $count) { return $medias; } - $medias[] = Media::fromTagPage($mediaArray); + $media = Media::fromTagPage($mediaArray); + if (in_array($media->id, $mediaIds)) { + return $medias; + } + $mediaIds[] = $media->id; + $medias[] = $media; $index++; } if (count($nodes) == 0) { @@ -197,8 +228,8 @@ public static function getPaginateMediasByTag($tag, $maxId = '') $medias = []; $toReturn = [ - 'medias' => $medias, - 'maxId' => $maxId, + 'medias' => $medias, + 'maxId' => $maxId, 'hasNextPage' => $hasNextPage ]; @@ -230,11 +261,13 @@ public static function getPaginateMediasByTag($tag, $maxId = '') $maxId = $arr['tag']['media']['page_info']['end_cursor']; $hasNextPage = $arr['tag']['media']['page_info']['has_next_page']; + $count = $arr['tag']['media']['count']; $toReturn = [ - 'medias' => $medias, - 'maxId' => $maxId, - 'hasNextPage' => $hasNextPage + 'medias' => $medias, + 'count' => $count, + 'maxId' => $maxId, + 'hasNextPage' => $hasNextPage, ]; return $toReturn; @@ -336,17 +369,12 @@ public static function getMediaCommentsByCode($code, $count = 10, $maxId = null) $remain = 0; } if (!isset($maxId)) { - $response = Request::get(Endpoints::getLastCommentsByCodeLink($code, $numberOfCommentsToRetreive)); + $parameters = Endpoints::getLastCommentsByCodeLink($code, $numberOfCommentsToRetreive); + } else { - $response = Request::get(Endpoints::getCommentsBeforeCommentIdByCode($code, $numberOfCommentsToRetreive, $maxId)); + $parameters = Endpoints::getCommentsBeforeCommentIdByCode($code, $numberOfCommentsToRetreive, $maxId); } - if ($response->code === 404) { - throw new InstagramNotFoundException('Account with given username does not exist.'); - } - if ($response->code !== 200) { - throw new InstagramException('Response code is ' . $response->code . '. Body: ' . $response->body . ' Something went wrong. Please report issue.'); - } - $jsonResponse = json_decode($response->raw_body, true); + $jsonResponse = json_decode(self::getContentsFromUrl($parameters), true); $nodes = $jsonResponse['comments']['nodes']; foreach ($nodes as $commentArray) { $comments[] = Comment::fromApi($commentArray); @@ -422,21 +450,4 @@ public static function getLocationById($facebookLocationId) $jsonResponse = json_decode($response->raw_body, true); return Location::makeLocation($jsonResponse['location']); } - - public static function getLastLikesByCode($code) - { - $response = Request::get(Endpoints::getLastLikesByCodeLink($code)); - if ($response->code === 404) { - throw new InstagramNotFoundException('Media with this shortcode doesn\'t exist'); - } - if ($response->code !== 200) { - throw new InstagramException('Response code is ' . $response->code . '. Body: ' . $response->body . ' Something went wrong. Please report issue.'); - } - $jsonResponse = json_decode($response->raw_body, true); - $users = []; - foreach ($jsonResponse['likes']['nodes'] as $userArray) { - $users[] = Account::fromAccountPage($userArray['user']); - } - return $users; - } } \ No newline at end of file diff --git a/src/InstagramScraper/Model/Media.php b/src/InstagramScraper/Model/Media.php index de47cda7..089f0427 100644 --- a/src/InstagramScraper/Model/Media.php +++ b/src/InstagramScraper/Model/Media.php @@ -43,10 +43,11 @@ public static function fromApi($mediaArray) $instance->link = $mediaArray['link']; $instance->commentsCount = $mediaArray['comments']['count']; $instance->likesCount = $mediaArray['likes']['count']; - $instance->imageLowResolutionUrl = self::getCleanImageUrl($mediaArray['images']['low_resolution']['url']); - $instance->imageThumbnailUrl = self::getCleanImageUrl($mediaArray['images']['thumbnail']['url']); - $instance->imageStandardResolutionUrl = self::getCleanImageUrl($mediaArray['images']['standard_resolution']['url']); - $instance->imageHighResolutionUrl = str_replace('320x320', '1080x1080', $instance->imageLowResolutionUrl); + $images = self::getImageUrls($mediaArray['images']['standard_resolution']['url']); + $instance->imageLowResolutionUrl = $images['low']; + $instance->imageThumbnailUrl = $images['thumbnail']; + $instance->imageStandardResolutionUrl = $images['standard']; + $instance->imageHighResolutionUrl = $images['high']; if (isset($mediaArray['caption'])) { $instance->caption = $mediaArray['caption']['text']; } @@ -54,9 +55,11 @@ public static function fromApi($mediaArray) if (isset($mediaArray['video_views'])) { $instance->videoViews = $mediaArray['video_views']; } - $instance->videoLowResolutionUrl = $mediaArray['videos']['low_resolution']['url']; - $instance->videoStandardResolutionUrl = $mediaArray['videos']['standard_resolution']['url']; - $instance->videoLowBandwidthUrl = $mediaArray['videos']['low_bandwidth']['url']; + if (isset($mediaArray['videos'])) { + $instance->videoLowResolutionUrl = $mediaArray['videos']['low_resolution']['url']; + $instance->videoStandardResolutionUrl = $mediaArray['videos']['standard_resolution']['url']; + $instance->videoLowBandwidthUrl = $mediaArray['videos']['low_bandwidth']['url']; + } } if (isset($mediaArray['location']['id'])) { $instance->locationId = $mediaArray['location']['id']; @@ -67,9 +70,17 @@ public static function fromApi($mediaArray) return $instance; } - private static function getCleanImageUrl($imageUrl) + private static function getImageUrls($imageUrl) { - return strpos($imageUrl, '?ig_cache_key=') ? substr($imageUrl, 0, strpos($imageUrl, '?ig_cache_key=')) : $imageUrl; + $parts = explode('/', parse_url($imageUrl)['path']); + $imageName = $parts[sizeof($parts) - 1]; + $urls = [ + 'thumbnail' => Endpoints::INSTAGRAM_CDN_URL . 't/s150x150/' . $imageName, + 'low' => Endpoints::INSTAGRAM_CDN_URL . 't/s320x320/' . $imageName, + 'standard' => Endpoints::INSTAGRAM_CDN_URL . 't/s640x640/' . $imageName, + 'high' => Endpoints::INSTAGRAM_CDN_URL . 't/' . $imageName + ]; + return $urls; } public static function fromMediaPage($mediaArray) @@ -111,29 +122,6 @@ public static function fromMediaPage($mediaArray) return $instance; } - private static function getImageUrls($imageUrl) - { - $imageUrl = self::getCleanImageUrl($imageUrl); - $parts = explode('/', parse_url($imageUrl)['path']); - if (sizeof($parts) == 4) { - $standard = 'https://scontent.cdninstagram.com/' . $parts[1] . '/s640x640/' . $parts[2] . '/' . $parts[3]; - } else { - if (isset($parts[4]) && $parts[4][0] == 'p') { - $standard = 'https://scontent.cdninstagram.com/' . $parts[1] . '/p640x640/' . $parts[3] . '/' . $parts[4]; - } else { - $standard = 'https://scontent.cdninstagram.com/' . $parts[1] . '/s640x640/' . $parts[3] . '/' . $parts[4]; - } - } - - $urls = [ - 'standard' => $standard, - 'low' => str_replace('640x640', '320x320', $standard), - 'high' => str_replace('640x640', '1080x1080', $standard), - 'thumbnail' => str_replace('640x640', '150x150', $standard) - ]; - return $urls; - } - public static function fromTagPage($mediaArray) { $instance = new self(); @@ -146,8 +134,11 @@ public static function fromTagPage($mediaArray) $instance->caption = $mediaArray['caption']; } $instance->createdTime = $mediaArray['date']; - $instance->imageThumbnailUrl = self::getCleanImageUrl($mediaArray['thumbnail_src']); - $instance->imageStandardResolutionUrl = self::getCleanImageUrl($mediaArray['display_src']); + $images = self::getImageUrls($mediaArray['display_src']); + $instance->imageStandardResolutionUrl = $images['standard']; + $instance->imageLowResolutionUrl = $images['low']; + $instance->imageHighResolutionUrl = $images['high']; + $instance->imageThumbnailUrl = $images['thumbnail']; $instance->type = 'image'; if ($mediaArray['is_video']) { $instance->type = 'video'; @@ -159,21 +150,13 @@ public static function fromTagPage($mediaArray) public static function getIdFromCode($code) { - $alphabet = [ - '-' => 62, '1' => 53, '0' => 52, '3' => 55, '2' => 54, '5' => 57, '4' => 56, '7' => 59, '6' => 58, '9' => 61, - '8' => 60, 'A' => 0, 'C' => 2, 'B' => 1, 'E' => 4, 'D' => 3, 'G' => 6, 'F' => 5, 'I' => 8, 'H' => 7, - 'K' => 10, 'J' => 9, 'M' => 12, 'L' => 11, 'O' => 14, 'N' => 13, 'Q' => 16, 'P' => 15, 'S' => 18, 'R' => 17, - 'U' => 20, 'T' => 19, 'W' => 22, 'V' => 21, 'Y' => 24, 'X' => 23, 'Z' => 25, '_' => 63, 'a' => 26, 'c' => 28, - 'b' => 27, 'e' => 30, 'd' => 29, 'g' => 32, 'f' => 31, 'i' => 34, 'h' => 33, 'k' => 36, 'j' => 35, 'm' => 38, - 'l' => 37, 'o' => 40, 'n' => 39, 'q' => 42, 'p' => 41, 's' => 44, 'r' => 43, 'u' => 46, 't' => 45, 'w' => 48, - 'v' => 47, 'y' => 50, 'x' => 49, 'z' => 51 - ]; - $n = 0; + $alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'; + $id = 0; for ($i = 0; $i < strlen($code); $i++) { $c = $code[$i]; - $n = $n * 64 + $alphabet[$c]; + $id = $id * 64 + strpos($alphabet, $c); } - return $n; + return $id; } public static function getLinkFromId($id) @@ -187,12 +170,12 @@ public static function getCodeFromId($id) $parts = explode('_', $id); $id = $parts[0]; $alphabet = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_'; - $shortenedId = ''; + $code = ''; while ($id > 0) { $remainder = $id % 64; $id = ($id - $remainder) / 64; - $shortenedId = $alphabet{$remainder} . $shortenedId; + $code = $alphabet{$remainder} . $code; }; - return $shortenedId; + return $code; } -} +} \ No newline at end of file diff --git a/tests/InstagramTest.php b/tests/InstagramTest.php index 3e840bdc..fa13e404 100644 --- a/tests/InstagramTest.php +++ b/tests/InstagramTest.php @@ -3,6 +3,7 @@ require '../vendor/autoload.php'; use InstagramScraper\Instagram; +use InstagramScraper\Model\Media; use PHPUnit\Framework\TestCase; @@ -63,4 +64,20 @@ public function testGetLocationById() $location = Instagram::getLocationById(1); $this->assertEquals('Dog Patch Labs', $location->name); } + + public function testGetIdFromCode() + { + $code = Media::getCodeFromId('1270593720437182847'); + $this->assertEquals('BGiDkHAgBF_', $code); + $code = Media::getCodeFromId('1270593720437182847_3'); + $this->assertEquals('BGiDkHAgBF_', $code); + $code = Media::getCodeFromId(1270593720437182847); + $this->assertEquals('BGiDkHAgBF_', $code); + } + + public function testGetCodeFromId() + { + $id = Media::getIdFromCode('BGiDkHAgBF_'); + $this->assertEquals(1270593720437182847, $id); + } } \ No newline at end of file