Download Multiple URLs FAST with cURL
Made 10,000 GET requests for 1,000 files from 500 different hosts (the top 500 Alexa sites) in 1 minute 44 seconds. Saved 4,500 files, with 500 file handles open at one time.
<?php
// Report every class of PHP error: 0x7FFFFFFF (2147483647) turns on all 31 low bits of the reporting mask.
error_reporting( 0x7FFFFFFF );
/**
 * Fetch many URLs concurrently with the cURL multi interface, optionally saving each body to disk.
 *
 * The URL list is split into chunks of `batch` entries (original array keys are preserved and
 * used in the saved file names). Each chunk is transferred in parallel, then every transfer is
 * inspected: a transfer that reported a cURL error, exceeded `max_redirs`, or downloaded nothing
 * is logged and, when saving, its partial file is deleted.
 *
 * Progress and errors are reported through the project's ISCLOG class, which must be loaded
 * before this function is called. The function prints a "COMPLETED IN" line when done and
 * calls die() if the elapsed time exceeds `max_time` at the start of a batch.
 *
 * @param array $args {
 *     Optional settings; any key that is omitted falls back to its default.
 *
 *     @type array  $urls             URLs to fetch. Default empty.
 *     @type int    $batch            Number of URLs fetched concurrently. Default 1000.
 *     @type int    $max_time         Seconds allowed for the whole run, checked before each batch. Default 360.
 *     @type int    $max_request_time Seconds a single transfer may take (CURLOPT_TIMEOUT). Default 10.
 *     @type int    $max_connect_time Seconds to wait while connecting; 0 waits indefinitely. Default 0.
 *     @type int    $max_redirs       Redirects allowed; forced to 0 when open_basedir is set. Default 2.
 *     @type string $user_agent       User-Agent header value. Default 'AskApache;'.
 *     @type array  $headers          Extra HTTP request headers. Default array( 'Accept-Encoding: none' ).
 *     @type string $logfile          File that receives cURL verbose output when `debug` is true. Default ''.
 *     @type bool   $debug            Enable cURL verbose logging to `logfile`. Default false.
 *     @type bool   $save             Save each response body into `savedir`. Default false.
 *     @type string $savedir          Existing directory to save into; saving is disabled if it is missing. Default ''.
 *     @type string $savelog          File mapping saved file names to URLs. Default is a dated file inside `savedir`.
 * }
 * @return void
 */
function askapache_curl_multi( $args = array() ) {
    ISCLOG::ti();

    // save the start time
    $started = time();

    $defaults = array(
        'urls'             => array(), // array containing all the urls to fetch
        'batch'            => 1000, // fetch this many urls concurrently (don't do more than 200 if using savedir)
        'max_time'         => ( 60 * 6 ), // maximum time allowed to complete all requests: 6 minutes
        'max_request_time' => 10, // maximum time an individual request will last before being closed: 10 seconds
        'max_connect_time' => 0, // The number of seconds to wait while trying to connect. Use 0 to wait indefinitely.
        'max_redirs'       => 2, // Number of redirects allowed
        'user_agent'       => 'AskApache;', // user-agent
        'headers'          => array( 'Accept-Encoding: none' ), // array of http headers, such as array( 'Cookie: thiscookie', 'Accept-Encoding: none' )
        'logfile'          => '',
        'debug'            => false,
        'save'             => false,
        'savedir'          => '',
        'savelog'          => '',
    );
    $args = array_merge( $defaults, $args );

    // Read the known options explicitly (unknown keys in $args are ignored).
    $urls             = (array) $args['urls'];
    $batch            = max( 1, (int) $args['batch'] ); // array_chunk() rejects sizes below 1
    $max_time         = $args['max_time'];
    $max_request_time = $args['max_request_time'];
    $max_connect_time = $args['max_connect_time'];
    $max_redirs       = $args['max_redirs'];
    $user_agent       = $args['user_agent'];
    $headers          = $args['headers'];
    $logfile          = $args['logfile'];
    $debug            = $args['debug'];
    $save             = $args['save'];
    $savedir          = $args['savedir'];
    $savelog          = $args['savelog'];

    // Do not abort script execution if a client disconnects
    //ignore_user_abort( true );
    // Set the number of seconds a script is allowed to run. Restarts the timeout counter from zero.
    //set_time_limit( $max_time );

    $fplog = $fpsavelog = null;

    // Only open the verbose log when a path was actually supplied.
    if ( $debug && '' !== (string) $logfile ) $fplog = fopen( $logfile, 'a' );

    // setup saving: any problem with the target directory or the save log disables saving
    if ( $save ) {
        $savedir = empty( $savedir ) ? '' : rtrim( $savedir, '/' ) . '/';

        if ( '' === $savedir || ! is_dir( $savedir ) ) {
            $save = false;
        } else {
            // set savelog containing the mapping of urls to files
            if ( empty( $savelog ) ) $savelog = $savedir . '__' . date( 'Y-m-d' ) . '_urls-to-files-map.log';

            // open save log
            $fpsavelog = fopen( $savelog, 'a' );
            if ( ! is_resource( $fpsavelog ) ) $save = false;
        }
    }

    // can't follow redirects when open_basedir is in effect
    if ( '' !== (string) ini_get( 'open_basedir' ) ) $max_redirs = 0;

    $total_urls = count( $urls );

    foreach ( array_chunk( $urls, $batch, true ) as $the_urls ) {
        // $con:   easy handles queued on the multi handle (false marks a failed curl_init())
        // $fps:   open file pointers receiving response bodies, keyed like $con
        // $files: full path of each save file, keyed like $con
        $con = $fps = $files = array();
        $url_count = count( $the_urls );
        $runtime = ( time() - $started );

        ISCLOG::ti( "BATCH: {$batch} total_urls: {$total_urls}" );

        // The time budget is only enforced between batches; exceeding it ends the whole script.
        if ( $runtime > $max_time ) {
            ISCLOG::ti( ' !!' . " ({$runtime} > {$max_time}) runtime: {$runtime} batch: {$batch} url_count: {$url_count}" );
            die( 'CRITICAL!! RUNTIME > MAX_TIME' );
        }

        $mh = curl_multi_init(); // create a 'multi handle'
        curl_multi_setopt( $mh, CURLMOPT_MAXCONNECTS, 20 ); // maximum amount of simultaneously open connections that libcurl may cache.
        // CURLMOPT_PIPELINING is intentionally not set: every request below is forced to HTTP/1.0,
        // which cannot pipeline, and libcurl 7.62.0+ no longer supports HTTP/1.x pipelining.

        foreach ( $the_urls as $i => $url ) {
            // Strip stray whitespace (e.g. "\r" from CRLF url lists) before handing the URL to cURL.
            $target = trim( (string) $url );

            // skip blank urls
            if ( '' === $target ) {
                ISCLOG::ti( "ERROR!! SKIPPED: {$url}" );
                continue;
            }

            // curl_init() returns a resource on PHP < 8 and a CurlHandle object on PHP >= 8,
            // so failure must be detected by comparing against false rather than with is_resource().
            $ch = curl_init( $target );
            if ( false === $ch ) {
                $con[ $i ] = false;
                ISCLOG::ti( "ERROR!! SKIPPED: {$url}" );
                continue;
            }

            if ( $save ) {
                $filename = $i . '_' . md5( $url ) . '.file';
                $fp = fopen( $savedir . $filename, 'wb' );

                // skip error opening handler to file; release the easy handle since it will never be queued
                if ( ! is_resource( $fp ) ) {
                    ISCLOG::ti( 'ERROR!! SAVING FILE TO: ' . $savedir . $filename . " !! SKIPPED: {$url}" );
                    curl_close( $ch );
                    continue;
                }

                $fps[ $i ]   = $fp;
                $files[ $i ] = $savedir . $filename;

                // save the filename mapping
                fwrite( $fpsavelog, $filename . ' ' . $target . "\n" );

                // have curl stream the body straight into the file (RETURNTRANSFER must be off, and set before CURLOPT_FILE)
                curl_setopt( $ch, CURLOPT_RETURNTRANSFER, 0 );
                curl_setopt( $ch, CURLOPT_FILE, $fp );
            } else {
                // keep the body out of the script's output
                curl_setopt( $ch, CURLOPT_RETURNTRANSFER, 1 );
            }

            // The number of seconds to wait while trying to connect. Use 0 to wait indefinitely.
            curl_setopt( $ch, CURLOPT_CONNECTTIMEOUT, $max_connect_time );
            // maximum time in seconds that you allow the libcurl transfer operation to take
            curl_setopt( $ch, CURLOPT_TIMEOUT, $max_request_time );
            // allow following redirects
            if ( $max_redirs > 0 ) curl_setopt( $ch, CURLOPT_FOLLOWLOCATION, 1 );
            // Number of redirects allowed
            curl_setopt( $ch, CURLOPT_MAXREDIRS, $max_redirs );
            // Return the page regardless of the HTTP status code (do not fail on >= 400).
            curl_setopt( $ch, CURLOPT_FAILONERROR, 0 );

            if ( $debug && is_resource( $fplog ) ) {
                // Verbose output goes to the logfile instead of STDERR.
                curl_setopt( $ch, CURLOPT_VERBOSE, 1 );
                curl_setopt( $ch, CURLOPT_STDERR, $fplog );
            } else {
                // Do not output verbose information.
                curl_setopt( $ch, CURLOPT_VERBOSE, 0 );
            }

            // Do not include the response headers in the body output.
            curl_setopt( $ch, CURLOPT_HEADER, 0 );
            // Force HTTP/1.0 requests.
            curl_setopt( $ch, CURLOPT_HTTP_VERSION, CURL_HTTP_VERSION_1_0 );

            // The User-Agent header
            if ( ! empty( $user_agent ) ) curl_setopt( $ch, CURLOPT_USERAGENT, $user_agent );
            // Additional headers to send
            if ( is_array( $headers ) && count( $headers ) > 0 ) curl_setopt( $ch, CURLOPT_HTTPHEADER, $headers );

            curl_multi_add_handle( $mh, $ch ); // add the easy handle to the multi handle 'multi stack' $mh
            $con[ $i ] = $ch;
        }

        // Drive all transfers to completion, sleeping in curl_multi_select() between rounds
        // instead of spinning the CPU.
        $still_running = null;
        do {
            do {
                $status = curl_multi_exec( $mh, $still_running );
            } while ( CURLM_CALL_MULTI_PERFORM === $status ); // older libcurl asks to be called again immediately

            if ( CURLM_OK !== $status ) break;

            // curl_multi_select() may return -1 when there is nothing to wait on; back off briefly then.
            if ( $still_running > 0 && -1 === curl_multi_select( $mh, 1.0 ) ) usleep( 100000 );
        } while ( $still_running > 0 );

        // Collect each finished transfer's result code from the multi message queue; this is
        // where the multi interface reports per-transfer errors.
        $results = array();
        while ( false !== ( $msg = curl_multi_info_read( $mh ) ) ) {
            if ( CURLMSG_DONE !== $msg['msg'] ) continue;
            $key = array_search( $msg['handle'], $con, true );
            if ( false !== $key ) $results[ $key ] = $msg['result'];
        }

        foreach ( $the_urls as $i => $url ) {
            // never queued (blank url or unwritable save file) - already logged above
            if ( ! isset( $con[ $i ] ) ) continue;

            // curl_init() failed for this url: there is no handle to query for error details
            if ( false === $con[ $i ] ) {
                ISCLOG::epx( array( 'url' => $url, 'chinfo' => array(), 'curl_errno' => null, 'curl_error' => null ) );
                continue;
            }

            $ch     = $con[ $i ];
            $rcount = curl_getinfo( $ch, CURLINFO_REDIRECT_COUNT );
            $size   = curl_getinfo( $ch, CURLINFO_SIZE_DOWNLOAD );
            // Prefer the result reported by the multi queue; fall back to the handle's own errno.
            $errno  = isset( $results[ $i ] ) ? $results[ $i ] : curl_errno( $ch );

            // A transfer failed if it errored, redirected too often, or downloaded nothing.
            if ( $rcount > $max_redirs || $errno || $size <= 0 ) {
                ISCLOG::l( curl_error( $ch ) );

                // discard the partial/empty file
                if ( $save ) {
                    if ( isset( $fps[ $i ] ) && is_resource( $fps[ $i ] ) ) fclose( $fps[ $i ] );
                    if ( isset( $files[ $i ] ) && is_file( $files[ $i ] ) ) unlink( $files[ $i ] );
                }
            }

            curl_multi_remove_handle( $mh, $ch ); // remove handle from 'multi stack' $mh
            curl_close( $ch ); // close the individual handle
        }

        curl_multi_close( $mh ); // close the multi stack

        // close the save file handlers that are still open
        foreach ( $fps as $fp ) {
            if ( is_resource( $fp ) ) fclose( $fp );
        }
    } // end foreach ( array_chunk( $urls, $batch, true ) as $the_urls )

    if ( is_resource( $fplog ) ) fclose( $fplog ); // close the logfile
    if ( is_resource( $fpsavelog ) ) fclose( $fpsavelog ); // close the save log

    echo "\nCOMPLETED IN: " . ( time() - $started ) . " SECONDS\n";

    ISCLOG::ti();
    ISCLOG::pls( $savedir );
}
// Define the project root from the document root (with "/htdocs" removed) unless it is already set,
// then load the ISCLOG logging class that askapache_curl_multi() relies on.
!defined( 'ISC_ROOT' ) && define( 'ISC_ROOT', str_replace( '/htdocs', '', $_SERVER['DOCUMENT_ROOT'] ) );
require_once ISC_ROOT . '/inc/isclog.inc.php';

header( 'Content-Type: text/plain' );

// Load the URL list: one URL per line. Surrounding whitespace (including "\r" from CRLF files)
// is stripped and blank lines are dropped so they are never requested as URLs.
$urls_file = '/web/askapach/sites/askapache.com/urls.txt';
$urls = array();
$lines = is_readable( $urls_file ) ? file( $urls_file, FILE_IGNORE_NEW_LINES ) : false;
if ( false === $lines ) {
    // An unreadable list is reported and treated as empty instead of becoming a single blank URL.
    echo "ERROR: unable to read URL list {$urls_file}\n";
} else {
    foreach ( $lines as $line ) {
        $line = trim( $line );
        if ( '' !== $line ) $urls[] = $line;
    }
}

// Buffer everything the run prints and emit it in one piece once the run has finished.
ob_start();
askapache_curl_multi( array(
    'savedir' => '/web/askapach/sites/askapache.com/savedir/',
    'save' => false,
    'urls'=> $urls,
    'logfile' => '/web/askapach/sites/askapache.com/logs/multi-curl.log',
    'debug' => false,
) );
echo ob_get_clean();
?>
« PortaPutty Auto-Reconnecting SSH Tunnels on an Encrypted TrueCrypt Portable USB Key w GPG | Free DNS Services »
Comments