939 lines
30 KiB
PHP
939 lines
30 KiB
PHP
<?php
|
|
/**
|
|
* Instagram Scraper Class
|
|
*
|
|
* Handles scraping of public Instagram profiles without official API.
|
|
* Uses multiple fallback methods and anti-blocking measures.
|
|
*
|
|
* @package Instagram_Gallery_Sync_Pro
|
|
*/
|
|
|
|
// Stop immediately when this file is loaded outside of WordPress.
defined('ABSPATH') || exit;
|
|
|
|
/**
|
|
* Class IGSP_Scraper
|
|
*/
|
|
class IGSP_Scraper
|
|
{
|
|
|
|
/**
 * Pool of desktop browser User-Agent strings.
 *
 * get_random_user_agent() picks one entry at random for a request.
 *
 * @var string[]
 */
private $user_agents = array(
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.3 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
);

/**
 * HTTP request timeout, in seconds (assigned in the constructor).
 *
 * @var int
 */
private $timeout;

/**
 * How many times a single request is attempted before giving up.
 *
 * @var int
 */
private $max_retries = 3;

/**
 * Lower bound, in seconds, of the random pause between requests.
 *
 * @var int
 */
private $min_delay = 2;

/**
 * Upper bound, in seconds, of the random pause between requests.
 *
 * @var int
 */
private $max_delay = 5;

/**
 * Logger used for info/warning/error/success messages.
 *
 * @var IGSP_Logger
 */
private $logger;
|
|
|
|
/**
 * Set up the scraper.
 *
 * Creates the logger and reads the request timeout from the
 * `igsp_request_timeout` option (30 seconds when the option is not set).
 */
public function __construct()
{
    $this->logger  = new IGSP_Logger();
    $this->timeout = (int) get_option('igsp_request_timeout', 30);
}
|
|
|
|
/**
 * Fetch the posts of a public Instagram profile.
 *
 * The username is sanitized and stripped of a leading "@" before it is
 * validated. Each scraping strategy is then tried in order, with a random
 * pause before every attempt; the first strategy that yields a non-empty,
 * non-error result wins.
 *
 * @param string $username Instagram username (with or without a leading "@")
 * @return array|WP_Error Array of posts, or WP_Error when the username is
 *                        empty or every strategy failed (the last strategy
 *                        error is returned when there is one)
 */
public function fetch_profile_data($username)
{
    // Normalise BEFORE validating: otherwise an input such as "@" passes the
    // emptiness check and is turned into an empty handle afterwards.
    $username = trim(ltrim(sanitize_text_field($username), '@'));

    // Strict comparison on purpose: empty() would also reject the handle "0".
    if ('' === $username) {
        return new WP_Error('no_username', __('Username not provided.', 'instagram-gallery-sync-pro'));
    }

    $this->logger->info(sprintf(__('Starting fetch for username: %s', 'instagram-gallery-sync-pro'), $username));

    // Strategies, most reliable first.
    $methods = array(
        'fetch_via_web_profile_info',
        'fetch_via_embed_page',
        'fetch_via_profile_page',
    );

    $last_error = null;

    foreach ($methods as $method) {
        $this->random_delay();

        $result = $this->$method($username);

        if (is_wp_error($result)) {
            // Remember the failure so the caller gets the most recent reason.
            $last_error = $result;
            $this->logger->warning(
                sprintf(__('Method %s failed: %s', 'instagram-gallery-sync-pro'), $method, $result->get_error_message()),
                array('method' => $method)
            );
            continue;
        }

        if (!empty($result)) {
            $this->logger->success(
                sprintf(__('Successfully fetched %d posts using %s', 'instagram-gallery-sync-pro'), count($result), $method),
                array('method' => $method, 'count' => count($result))
            );
            return $result;
        }
    }

    // All methods failed
    $error = $last_error ?? new WP_Error('fetch_failed', __('All scraping methods failed.', 'instagram-gallery-sync-pro'));
    $this->logger->error(__('Failed to fetch Instagram data after trying all methods.', 'instagram-gallery-sync-pro'));

    return $error;
}
|
|
|
|
/**
 * Fetch posts through the `web_profile_info` JSON endpoint.
 *
 * The Instagram home page is requested first to discover the app ID sent in
 * the `X-IG-App-ID` header. When that page cannot be loaded, or no ID is
 * found in it, the hard-coded default ID is used instead of aborting.
 *
 * @param string $username Instagram username
 * @return array|WP_Error Normalized posts, or WP_Error when the request
 *                        fails, the response is not valid JSON, the user is
 *                        missing or private, or the profile has no media
 */
private function fetch_via_web_profile_info($username)
{
    $app_id        = null;
    $main_response = $this->make_request('https://www.instagram.com/');

    if (is_wp_error($main_response)) {
        // Not fatal: the default app ID below exists exactly for this case.
        $this->logger->warning(
            sprintf(
                __('Could not load Instagram home page (%s); using default app ID.', 'instagram-gallery-sync-pro'),
                $main_response->get_error_message()
            )
        );
    } else {
        $app_id = $this->extract_app_id(wp_remote_retrieve_body($main_response));
    }

    $this->random_delay();

    // Now fetch the profile using the web profile info endpoint
    $url = 'https://www.instagram.com/api/v1/users/web_profile_info/?username=' . urlencode($username);

    $headers = array(
        'X-IG-App-ID'      => $app_id ?: '936619743392459',
        'X-ASBD-ID'        => '129477',
        'X-IG-WWW-Claim'   => '0',
        'X-Requested-With' => 'XMLHttpRequest',
        // Encode the handle so unexpected characters cannot corrupt the header value.
        'Referer'          => 'https://www.instagram.com/' . rawurlencode($username) . '/',
        'Accept'           => '*/*',
    );

    $response = $this->make_request($url, array('headers' => $headers));

    if (is_wp_error($response)) {
        return $response;
    }

    $data = json_decode(wp_remote_retrieve_body($response), true);

    // A scalar such as `"ok"` is valid JSON too, so also require an array.
    if (json_last_error() !== JSON_ERROR_NONE || !is_array($data)) {
        return new WP_Error('json_error', __('Invalid JSON response from web profile info.', 'instagram-gallery-sync-pro'));
    }

    if (empty($data['data']['user'])) {
        return new WP_Error('no_user', __('No user data found.', 'instagram-gallery-sync-pro'));
    }

    $user = $data['data']['user'];

    if (!empty($user['is_private'])) {
        return new WP_Error('private_profile', __('This Instagram profile is private.', 'instagram-gallery-sync-pro'));
    }

    $edges = $user['edge_owner_to_timeline_media']['edges'] ?? array();

    if (empty($edges)) {
        return new WP_Error('no_media', __('No media found on this profile.', 'instagram-gallery-sync-pro'));
    }

    return $this->parse_graphql_edges($edges, $username);
}
|
|
|
|
/**
 * Fetch posts by loading each post's `/embed/` page.
 *
 * The profile page is downloaded to collect post shortcodes; then up to
 * `igsp_max_images` (default 12) embed pages are requested and parsed.
 * Embed pages that fail to load or to parse are skipped.
 *
 * @param string $username Instagram username
 * @return array|WP_Error Posts, or WP_Error when the profile request fails,
 *                        no shortcodes are found, or no embed could be parsed
 */
private function fetch_via_embed_page($username)
{
    // First get the profile to find some post shortcodes
    $response = $this->make_request('https://www.instagram.com/' . $username . '/');

    if (is_wp_error($response)) {
        return $response;
    }

    // Reindex: de-duplication can leave gaps in the keys, and positional
    // access on such an array hits undefined offsets and skips posts.
    $shortcodes = array_values($this->extract_shortcodes(wp_remote_retrieve_body($response)));

    if (empty($shortcodes)) {
        return new WP_Error('no_shortcodes', __('Could not find any post links.', 'instagram-gallery-sync-pro'));
    }

    $posts      = array();
    $max_images = max(0, (int) get_option('igsp_max_images', 12));

    // Fetch each post via embed
    foreach (array_slice($shortcodes, 0, $max_images) as $shortcode) {
        $this->random_delay(1, 2);

        $embed_response = $this->make_request('https://www.instagram.com/p/' . $shortcode . '/embed/');

        if (is_wp_error($embed_response)) {
            // Best effort: one broken embed must not fail the whole batch.
            continue;
        }

        $post_data = $this->parse_embed_page(wp_remote_retrieve_body($embed_response), $shortcode, $username);

        if ($post_data) {
            $posts[] = $post_data;
        }
    }

    if (empty($posts)) {
        return new WP_Error('no_posts', __('Could not extract any posts from embeds.', 'instagram-gallery-sync-pro'));
    }

    return $posts;
}
|
|
|
|
/**
 * Fallback strategy: download the public profile page and mine its HTML.
 *
 * The parsers run in a fixed order and the first non-empty result wins.
 *
 * @param string $username Instagram username
 * @return array|WP_Error Posts, or WP_Error when the request fails, the body
 *                        is empty, or none of the parsers finds anything
 */
private function fetch_via_profile_page($username)
{
    $response = $this->make_request('https://www.instagram.com/' . $username . '/');
    if (is_wp_error($response)) {
        return $response;
    }

    $body = wp_remote_retrieve_body($response);
    if (empty($body)) {
        return new WP_Error('empty_response', __('Empty response from Instagram.', 'instagram-gallery-sync-pro'));
    }

    // Parsing strategies, tried in this order.
    $parsers = array(
        'parse_require_js_data',
        'parse_shared_data',
        'parse_preloaded_data',
        'parse_og_images',
    );

    foreach ($parsers as $parser) {
        $posts = $this->$parser($body, $username);
        if (!empty($posts)) {
            return $posts;
        }
    }

    return new WP_Error('parse_failed', __('Could not parse Instagram profile page.', 'instagram-gallery-sync-pro'));
}
|
|
|
|
/**
 * Find the numeric Instagram app ID embedded in a page.
 *
 * The regular expressions are tried in order; the digits captured by the
 * first one that matches are returned.
 *
 * @param string $html HTML content
 * @return string|null The app ID digits, or null when no pattern matches
 */
private function extract_app_id($html)
{
    $regexes = array(
        '/"X-IG-App-ID":"(\d+)"/',
        '/appId":"(\d+)"/',
        '/APP_ID":"(\d+)"/',
        '/{"APP_ID":"(\d+)"/',
    );

    $found = array();

    foreach ($regexes as $regex) {
        if (preg_match($regex, $html, $found) === 1) {
            return $found[1];
        }
    }

    return null;
}
|
|
|
|
/**
 * Extract post shortcodes from profile HTML.
 *
 * Looks for `/p/<code>/` and `/reel/<code>/` links as well as
 * `"shortcode":"<code>"` and `"code":"<code>"` JSON fragments.
 *
 * @param string $html HTML content
 * @return array Unique shortcodes as a zero-based list, in order of discovery
 */
private function extract_shortcodes($html)
{
    $shortcodes = array();

    // Pattern to find post links
    $patterns = array(
        '/\/p\/([A-Za-z0-9_-]+)\//',
        '/\/reel\/([A-Za-z0-9_-]+)\//',
        '/"shortcode":"([A-Za-z0-9_-]+)"/',
        '/"code":"([A-Za-z0-9_-]+)"/',
    );

    foreach ($patterns as $pattern) {
        if (preg_match_all($pattern, $html, $matches)) {
            $shortcodes = array_merge($shortcodes, $matches[1]);
        }
    }

    // array_unique() keeps the original keys, which leaves gaps wherever a
    // duplicate was dropped; reindex so callers can address entries by position.
    return array_values(array_unique($shortcodes));
}
|
|
|
|
/**
 * Parse a post's `/embed/` page into a normalized post array.
 *
 * The image URL is taken from the first matching `<img>` / `og:image`
 * pattern, falling back to a `"display_url"` JSON fragment. Caption and
 * timestamp are extracted when present.
 *
 * @param string $html      HTML content of the embed page
 * @param string $shortcode Post shortcode
 * @param string $username  Username
 * @return array|null Post data, or null when no image URL could be found
 */
private function parse_embed_page($html, $shortcode, $username)
{
    $image_url = null;
    $caption   = '';

    // Try to find the main image
    $img_patterns = array(
        '/<img[^>]+class="[^"]*EmbeddedMediaImage[^"]*"[^>]+src="([^"]+)"/',
        '/<img[^>]+src="([^"]+instagram[^"]+)"[^>]+class="[^"]*Image/',
        '/property="og:image"[^>]+content="([^"]+)"/',
        '/content="([^"]+)"[^>]+property="og:image"/',
        '/<img[^>]+srcset="([^"]+)"/',
    );

    foreach ($img_patterns as $pattern) {
        if (preg_match($pattern, $html, $matches)) {
            // A srcset value is "url descriptor, url descriptor, ..." - keep
            // only the first URL. Trimming first avoids an empty first token.
            $parts     = preg_split('/\s+/', trim($matches[1]), 2);
            $candidate = is_array($parts) ? $parts[0] : '';

            // Attribute values are HTML-escaped ("&amp;" between query
            // arguments); a URL that is not decoded has a broken query string.
            $image_url = html_entity_decode($candidate, ENT_QUOTES | ENT_HTML5, 'UTF-8');
            break;
        }
    }

    // Try to extract from JSON in the page
    if (empty($image_url) && preg_match('/"display_url":"([^"]+)"/', $html, $matches)) {
        // The capture is the inside of a JSON string literal. Decoding it as
        // JSON resolves both "\/" and "\u0026"; stripcslashes() would turn
        // "\u0026" into "u0026". Fall back to it only if the fragment is not
        // valid JSON.
        $decoded   = json_decode('"' . $matches[1] . '"');
        $image_url = is_string($decoded) ? $decoded : stripcslashes($matches[1]);
    }

    if (empty($image_url)) {
        return null;
    }

    // Extract caption
    if (preg_match('/<div class="[^"]*Caption[^"]*"[^>]*>.*?<span[^>]*>(.+?)<\/span>/s', $html, $matches)) {
        $caption = strip_tags($matches[1]);
    }

    // Try to get timestamp; strtotime() yields false for unparseable values,
    // in which case the current time is used below.
    $timestamp = null;
    if (preg_match('/datetime="([^"]+)"/', $html, $matches)) {
        $timestamp = strtotime($matches[1]);
    }

    return array(
        'instagram_id'   => md5($shortcode . $username),
        'shortcode'      => $shortcode,
        'username'       => $username,
        'image_url'      => $image_url,
        'thumbnail_url'  => $image_url,
        'post_url'       => 'https://www.instagram.com/p/' . $shortcode . '/',
        'caption'        => $caption,
        'likes_count'    => null,
        'comments_count' => null,
        'posted_at'      => $timestamp ? date('Y-m-d H:i:s', $timestamp) : current_time('mysql'),
        'image_width'    => 0,
        'image_height'   => 0,
        'is_video'       => false,
    );
}
|
|
|
|
/**
 * Parse timeline data embedded as JSON fragments in the profile page.
 *
 * Each regular expression captures a JSON object; the first capture that
 * decodes and contains media edges is converted into posts.
 *
 * @param string $html     HTML content
 * @param string $username Username
 * @return array Normalized posts, or an empty array when nothing usable is found
 */
private function parse_require_js_data($html, $username)
{
    // Try to find JSON data in script tags
    $json_patterns = array(
        '/"xdt_api__v1__feed__user_timeline_graphql_connection":\s*({.+?})\s*,\s*"extensions"/',
        '/"edge_owner_to_timeline_media":\s*({.+?})\s*,\s*"edge_/',
        '/{"xdt_api__v1__users__web_profile_info".*?"user":\s*({.+?})\s*}\s*}/',
    );

    foreach ($json_patterns as $json_pattern) {
        if (!preg_match($json_pattern, $html, $matches)) {
            continue;
        }

        $data = json_decode($matches[1], true);

        if (!is_array($data) || empty($data)) {
            continue;
        }

        // The first two patterns capture a connection object that has
        // "edges" at its top level; the third captures a user object, where
        // the edges sit one level down under "edge_owner_to_timeline_media".
        $edges = $data['edges'] ?? $data['edge_owner_to_timeline_media']['edges'] ?? array();

        if (!empty($edges) && is_array($edges)) {
            return $this->parse_graphql_edges($edges, $username);
        }
    }

    return array();
}
|
|
|
|
/**
 * Parse JSON blobs that the page preloads in script tags.
 *
 * Every blob matched by one of the regular expressions is decoded and
 * searched for media edges; the first blob containing edges is converted
 * into posts.
 *
 * @param string $html     HTML content
 * @param string $username Username
 * @return array Normalized posts, or an empty array when no blob has edges
 */
private function parse_preloaded_data($html, $username)
{
    $regexes = array(
        '/window\.__additionalDataLoaded\s*\(\s*[\'"][^\'"]+[\'"]\s*,\s*({.+?})\s*\)\s*;/s',
        '/window\._sharedData\s*=\s*({.+?});/s',
        '/<script type="application\/json"[^>]*>(\{.+?\})<\/script>/s',
    );

    foreach ($regexes as $regex) {
        $found = array();

        if (!preg_match_all($regex, $html, $found)) {
            continue;
        }

        foreach ($found[1] as $candidate_json) {
            $decoded = json_decode($candidate_json, true);

            // Skip captures that are not valid JSON.
            if (json_last_error() !== JSON_ERROR_NONE) {
                continue;
            }

            $edges = $this->find_edges_in_data($decoded);

            if (empty($edges)) {
                continue;
            }

            return $this->parse_graphql_edges($edges, $username);
        }
    }

    return array();
}
|
|
|
|
/**
 * Locate the media edges inside a decoded JSON structure.
 *
 * A list of known key paths is probed first; when none of them leads to a
 * non-empty array, the structure is searched recursively for an "edges" key.
 *
 * @param array $data Data array
 * @return array The edges, or an empty array when none are found
 */
private function find_edges_in_data($data)
{
    if (!is_array($data)) {
        return array();
    }

    // Known locations of the timeline edges.
    $known_paths = array(
        array('entry_data', 'ProfilePage', 0, 'graphql', 'user', 'edge_owner_to_timeline_media', 'edges'),
        array('graphql', 'user', 'edge_owner_to_timeline_media', 'edges'),
        array('data', 'user', 'edge_owner_to_timeline_media', 'edges'),
        array('user', 'edge_owner_to_timeline_media', 'edges'),
        array('edge_owner_to_timeline_media', 'edges'),
        array('require', 0, 3, 0, '__bbox', 'require', 0, 3, 1, '__bbox', 'result', 'data', 'xdt_api__v1__feed__user_timeline_graphql_connection', 'edges'),
    );

    foreach ($known_paths as $segments) {
        $cursor   = $data;
        $resolved = true;

        // Walk down one key at a time; give up on the first missing key.
        foreach ($segments as $segment) {
            if (!is_array($cursor) || !isset($cursor[$segment])) {
                $resolved = false;
                break;
            }
            $cursor = $cursor[$segment];
        }

        if ($resolved && is_array($cursor) && !empty($cursor)) {
            return $cursor;
        }
    }

    // No known path matched: fall back to a recursive search.
    return $this->recursive_find_edges($data);
}
|
|
|
|
/**
 * Depth-first search for an "edges" list that looks like media edges.
 *
 * An "edges" value qualifies when its first element has a
 * `node.display_url` or `node.thumbnail_src` entry. The search stops
 * descending once the depth exceeds 10.
 *
 * @param array $data  Data array
 * @param int   $depth Current depth
 * @return array The first qualifying edges list, or an empty array
 */
private function recursive_find_edges($data, $depth = 0)
{
    if (!is_array($data) || $depth > 10) {
        return array();
    }

    foreach ($data as $name => $child) {
        // Scalars can neither be an edges list nor contain one.
        if (!is_array($child)) {
            continue;
        }

        if ('edges' === $name && !empty($child)) {
            $looks_like_media = isset($child[0]['node']['display_url'])
                || isset($child[0]['node']['thumbnail_src']);

            if ($looks_like_media) {
                return $child;
            }
        }

        $nested = $this->recursive_find_edges($child, $depth + 1);

        if (!empty($nested)) {
            return $nested;
        }
    }

    return array();
}
|
|
|
|
/**
 * Last-resort parser: build posts from raw image URLs found in the page.
 *
 * Uses the `og:image` meta tag plus up to 12 distinct URLs that look like
 * Instagram-hosted `.jpg` files. URLs containing `_a.jpg` or `150x150`
 * are skipped as profile pictures / tiny thumbnails.
 *
 * No shortcode, caption or date is available here, so `post_url` points to
 * the profile and `posted_at` is the current time.
 *
 * @param string $html     HTML content
 * @param string $username Username
 * @return array Normalized posts (possibly empty)
 */
private function parse_og_images($html, $username)
{
    // Each candidate is array(kind, raw URL as it appears in the HTML).
    $candidates = array();

    // The OG image is just the profile picture or first post, but it's something.
    if (preg_match('/property="og:image"[^>]+content="([^"]+)"/', $html, $matches)) {
        $candidates[] = array('og', $matches[1]);
    }

    // Also try to find any image URLs that look like Instagram CDN
    $pattern = '/(https:\/\/[^"\']+?instagram[^"\']+?\.jpg[^"\']*)/i';
    if (preg_match_all($pattern, $html, $matches)) {
        foreach (array_slice(array_unique($matches[1]), 0, 12) as $img_url) {
            // Skip profile pictures and very small images
            if (strpos($img_url, '_a.jpg') !== false || strpos($img_url, '150x150') !== false) {
                continue;
            }
            $candidates[] = array('img', $img_url);
        }
    }

    $posts       = array();
    $seen        = array();
    $profile_url = 'https://www.instagram.com/' . $username . '/';

    foreach ($candidates as $candidate) {
        list($kind, $raw_url) = $candidate;

        // URLs are lifted from HTML source, so "&amp;" must become "&".
        $image_url = html_entity_decode($raw_url, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        // The og:image URL is usually matched by the CDN pattern as well;
        // de-duplicate across both sources so it is not returned twice.
        if (isset($seen[$image_url])) {
            continue;
        }
        $seen[$image_url] = true;

        $posts[] = array(
            // Derive the ID from the URL itself: an ID built from time() or
            // from the list position changes (or collides) between syncs.
            'instagram_id'   => md5($username . '_' . $kind . '_' . $image_url),
            'shortcode'      => '',
            'username'       => $username,
            'image_url'      => $image_url,
            'thumbnail_url'  => $image_url,
            'post_url'       => $profile_url,
            'caption'        => '',
            'likes_count'    => null,
            'comments_count' => null,
            'posted_at'      => current_time('mysql'),
            'image_width'    => 0,
            'image_height'   => 0,
            'is_video'       => false,
        );
    }

    return $posts;
}
|
|
|
|
/**
 * Parse the `window._sharedData` JSON blob of a profile page.
 *
 * @param string $html     HTML content
 * @param string $username Username
 * @return array Normalized posts, or an empty array when the blob is missing,
 *               is not valid JSON, or holds no timeline edges
 */
private function parse_shared_data($html, $username)
{
    $found = array();

    if (preg_match('/window\._sharedData\s*=\s*({.+?});/s', $html, $found) !== 1) {
        return array();
    }

    $shared = json_decode($found[1], true);

    if (json_last_error() !== JSON_ERROR_NONE) {
        return array();
    }

    // Timeline edges of the first (only) profile page entry.
    $edges = $shared['entry_data']['ProfilePage'][0]['graphql']['user']['edge_owner_to_timeline_media']['edges'] ?? array();

    return empty($edges) ? array() : $this->parse_graphql_edges($edges, $username);
}
|
|
|
|
/**
 * Convert GraphQL edges (or bare media nodes) into normalized post arrays.
 *
 * Both the legacy GraphQL field names (`shortcode`, `display_url`,
 * `taken_at_timestamp`, ...) and the newer API names (`code`,
 * `image_versions2`, `taken_at`, ...) are understood. Videos and entries
 * without an image URL are skipped. At most `igsp_max_images`
 * (default 12) posts are returned.
 *
 * @param array  $edges    GraphQL edges
 * @param string $username Username
 * @return array List of normalized posts
 */
private function parse_graphql_edges($edges, $username)
{
    $posts      = array();
    $max_images = (int) get_option('igsp_max_images', 12);

    foreach ($edges as $edge) {
        // Count collected posts, not the edge position: skipped videos or
        // image-less entries must not eat into the configured maximum.
        if (count($posts) >= $max_images) {
            break;
        }

        $node = $edge['node'] ?? $edge;

        if (empty($node) || !is_array($node)) {
            continue;
        }

        // `??` binds weaker than `===`, so the media_type comparison needs
        // its own parentheses and its own default; without them a missing
        // "media_type" key is read unguarded.
        $is_video = $node['is_video'] ?? ((int) ($node['media_type'] ?? 0) === 2);

        // Skip videos if not supported
        if (!empty($is_video)) {
            continue;
        }

        $image_url = $this->get_best_image_url($node);

        // Only add if we have an image
        if (empty($image_url)) {
            continue;
        }

        $shortcode = $node['shortcode'] ?? $node['code'] ?? '';
        $taken_at  = $node['taken_at_timestamp'] ?? $node['taken_at'] ?? null;

        $posts[] = array(
            'instagram_id'   => $node['id'] ?? $node['pk'] ?? md5(json_encode($node)),
            'shortcode'      => $shortcode,
            'username'       => $username,
            'image_url'      => $image_url,
            'thumbnail_url'  => $node['thumbnail_src'] ?? $node['display_url'] ?? $node['image_versions2']['candidates'][0]['url'] ?? '',
            // Without a shortcode a "/p//" link would be broken; link to the profile instead.
            'post_url'       => '' !== $shortcode
                ? 'https://www.instagram.com/p/' . $shortcode . '/'
                : 'https://www.instagram.com/' . $username . '/',
            'caption'        => $this->extract_caption($node),
            'likes_count'    => $node['edge_liked_by']['count'] ?? $node['edge_media_preview_like']['count'] ?? $node['like_count'] ?? null,
            'comments_count' => $node['edge_media_to_comment']['count'] ?? $node['edge_media_preview_comment']['count'] ?? $node['comment_count'] ?? null,
            'posted_at'      => null !== $taken_at ? date('Y-m-d H:i:s', (int) $taken_at) : current_time('mysql'),
            'image_width'    => $node['dimensions']['width'] ?? $node['original_width'] ?? 0,
            'image_height'   => $node['dimensions']['height'] ?? $node['original_height'] ?? 0,
            'is_video'       => false,
        );
    }

    return $posts;
}
|
|
|
|
/**
 * Pick an image URL from a media node according to the quality setting.
 *
 * The `igsp_image_quality` option ("thumbnail", "medium" or "high",
 * default "high") selects the variant. The newer `image_versions2`
 * candidate list is preferred; the legacy `display_url` /
 * `thumbnail_src` / `thumbnail_resources` fields are used otherwise.
 *
 * @param array $node Node data
 * @return string Image URL, or an empty string when the node has none
 */
private function get_best_image_url($node)
{
    $quality = get_option('igsp_image_quality', 'high');

    // Check for image_versions2 format (newer API)
    $candidates = $node['image_versions2']['candidates'] ?? array();

    if (!empty($candidates) && is_array($candidates)) {
        // Reindex so positional access below is safe for any key layout.
        // NOTE(review): the index choice assumes candidates are ordered
        // largest-first - confirm against real responses.
        $candidates = array_values($candidates);
        $first      = $candidates[0]['url'] ?? '';

        switch ($quality) {
            case 'thumbnail':
                $url = $candidates[count($candidates) - 1]['url'] ?? $first;
                break;

            case 'medium':
                // intdiv() yields an int key; floor() would produce a float.
                $url = $candidates[intdiv(count($candidates), 2)]['url'] ?? $first;
                break;

            case 'high':
            default:
                $url = $first;
                break;
        }

        // Fall through to the legacy fields when the candidates carry no URL.
        if (is_string($url) && '' !== $url) {
            return $url;
        }
    }

    switch ($quality) {
        case 'thumbnail':
            return $node['thumbnail_src'] ?? $node['thumbnail_resources'][0]['src'] ?? $node['display_url'] ?? '';

        case 'medium':
            $resources = $node['thumbnail_resources'] ?? array();
            $mid_index = is_array($resources) ? intdiv(count($resources), 2) : 0;
            return $resources[$mid_index]['src'] ?? $node['display_url'] ?? '';

        case 'high':
        default:
            return $node['display_url'] ?? $node['thumbnail_src'] ?? '';
    }
}
|
|
|
|
/**
 * Read the caption text of a media node.
 *
 * Checks, in order: the GraphQL `edge_media_to_caption` edge, a
 * `caption.text` object field, and a plain string `caption`.
 *
 * @param array $node Node data
 * @return string Caption passed through wp_kses_post(); empty when the node has no caption
 */
private function extract_caption($node)
{
    $plain_caption = (isset($node['caption']) && is_string($node['caption'])) ? $node['caption'] : '';

    $caption = $node['edge_media_to_caption']['edges'][0]['node']['text']
        ?? $node['caption']['text']
        ?? $plain_caption;

    return wp_kses_post($caption);
}
|
|
|
|
/**
 * Perform a GET request with browser-like headers, retries and back-off.
 *
 * The request is attempted up to `$max_retries` times with a growing
 * random pause between attempts. Only a 200 response counts as success.
 * A 429 response triggers an additional 30 second wait before the next
 * attempt.
 *
 * @param string $url            URL to request
 * @param array  $args           Additional wp_remote_get() arguments; `headers`
 *                               are merged over the defaults
 * @param int    $redirect_depth Internal: number of redirects already followed
 *                               by this method (callers should not pass it)
 * @return array|WP_Error The response array on HTTP 200, otherwise the last
 *                        error encountered
 */
private function make_request($url, $args = array(), $redirect_depth = 0)
{
    // Upper bound for redirects followed by the manual handling below.
    $max_redirects = 5;

    $custom_ua  = get_option('igsp_user_agent', '');
    $user_agent = !empty($custom_ua) ? $custom_ua : $this->get_random_user_agent();

    $default_args = array(
        'timeout'    => $this->timeout,
        'user-agent' => $user_agent,
        'headers'    => array(
            'Accept'                    => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language'           => 'en-US,en;q=0.9',
            // "br" is deliberately not advertised: whether a Brotli body
            // gets decoded depends on how the HTTP transport was built, and
            // an undecoded body breaks every parser in this class.
            'Accept-Encoding'           => 'gzip, deflate',
            'Connection'                => 'keep-alive',
            'Upgrade-Insecure-Requests' => '1',
            'Sec-Fetch-Dest'            => 'document',
            'Sec-Fetch-Mode'            => 'navigate',
            'Sec-Fetch-Site'            => 'none',
            'Sec-Fetch-User'            => '?1',
            'Sec-Ch-Ua'                 => '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
            'Sec-Ch-Ua-Mobile'          => '?0',
            'Sec-Ch-Ua-Platform'        => '"Windows"',
            'Cache-Control'             => 'max-age=0',
        ),
        'sslverify'  => true,
        'cookies'    => array(),
    );

    // Merge custom headers over the defaults (wp_parse_args() is shallow).
    if (isset($args['headers'])) {
        $args['headers'] = array_merge($default_args['headers'], $args['headers']);
    }

    $args = wp_parse_args($args, $default_args);

    // Add proxy if configured
    // NOTE(review): "proxy" does not appear to be a documented
    // wp_remote_get() argument (WordPress reads proxy settings from the
    // WP_PROXY_HOST / WP_PROXY_PORT constants) - confirm this has any effect.
    $proxy_host = get_option('igsp_proxy_host', '');
    $proxy_port = get_option('igsp_proxy_port', '');

    if (!empty($proxy_host) && !empty($proxy_port)) {
        $args['proxy'] = $proxy_host . ':' . $proxy_port;
    }

    $last_error = null;

    for ($attempt = 1; $attempt <= $this->max_retries; $attempt++) {
        $has_more_attempts = $attempt < $this->max_retries;
        $response          = wp_remote_get($url, $args);

        if (is_wp_error($response)) {
            $last_error = $response;
        } else {
            $status_code = (int) wp_remote_retrieve_response_code($response);

            if (200 === $status_code) {
                return $response;
            }

            // Record every non-200 outcome so the caller sees the real
            // status instead of a generic failure.
            $last_error = new WP_Error(
                'http_error',
                sprintf(__('HTTP Error: %d', 'instagram-gallery-sync-pro'), $status_code)
            );

            if (429 === $status_code) {
                // Rate limited - wait longer, but only when a retry follows.
                $this->logger->warning(__('Rate limited by Instagram, waiting...', 'instagram-gallery-sync-pro'));
                if ($has_more_attempts) {
                    sleep(30);
                }
            } elseif (301 === $status_code || 302 === $status_code) {
                // NOTE(review): wp_remote_get() follows redirects on its own
                // by default, so this branch is presumably rarely reached.
                $location = wp_remote_retrieve_header($response, 'location');

                // Repeated headers are returned as an array; use the last one.
                if (is_array($location)) {
                    $location = end($location);
                }

                // The depth limit prevents unbounded recursion on redirect loops.
                if (!empty($location) && $redirect_depth < $max_redirects) {
                    $this->random_delay(1, 2);
                    return $this->make_request($location, $args, $redirect_depth + 1);
                }
            }
        }

        if ($has_more_attempts) {
            // Linear back-off: the pause window grows with each attempt.
            $this->random_delay($this->min_delay * $attempt, $this->max_delay * $attempt);
        }
    }

    return $last_error ?? new WP_Error('request_failed', __('Request failed after retries.', 'instagram-gallery-sync-pro'));
}
|
|
|
|
/**
 * Pick one User-Agent string at random from the configured pool.
 *
 * @return string
 */
private function get_random_user_agent()
{
    $pool         = $this->user_agents;
    $picked_index = array_rand($pool);

    return $pool[$picked_index];
}
|
|
|
|
/**
 * Pause execution for a random duration.
 *
 * When a bound is omitted, the class-level minimum / maximum delay is used.
 *
 * @param int $min Minimum seconds
 * @param int $max Maximum seconds
 * @return void
 */
private function random_delay($min = null, $max = null)
{
    $lower_seconds = (null === $min) ? $this->min_delay : $min;
    $upper_seconds = (null === $max) ? $this->max_delay : $max;

    // usleep() takes microseconds, hence the conversion of both bounds.
    usleep(rand($lower_seconds * 1000000, $upper_seconds * 1000000));
}
|
|
|
|
/**
 * Check that a post array carries the minimum data needed to be stored.
 *
 * A post is valid when it has a non-empty `instagram_id` and a non-empty
 * `image_url` that passes PHP's URL validation filter.
 *
 * @param array $data Post data
 * @return bool True when the post is usable, false otherwise
 */
public function validate_post_data($data)
{
    return !empty($data['instagram_id'])
        && !empty($data['image_url'])
        && (bool) filter_var($data['image_url'], FILTER_VALIDATE_URL);
}
|
|
}
|