Also supports the Advanced API and use from China.
Use pip to install:
pip install octoparse
Three methods of supplying credentials are supported, as described below:
Include the following as environment variables:
export OCTOPARSE_USERNAME=octoparse_user
export OCTOPARSE_PASSWORD=octoparse_passwd
Include the following in a .env file in the script directory:
OCTOPARSE_USERNAME=octoparse_user
OCTOPARSE_PASSWORD=octoparse_passwd
Enter the username and password manually, once, when prompted:
Enter Octoparse Username: octoparse_user
Password:
# Import the API client class.
from octoparse import Octoparse
# Initialize the API client with default settings.
# It will try to log in and ask for credentials if required.
# NOTE: the three constructor calls below are alternatives; each one rebinds
# `octo`, so keep only the one that applies to your account.
octo = Octoparse()
# Alternative: if using the Advanced API:
octo = Octoparse(advanced_api=True)
# Alternative: if using Octoparse from China:
octo = Octoparse(china=True)
# List all task groups.
groups = octo.list_all_task_groups()
# List all tasks in the group with the given group id.
tasks = octo.list_all_tasks_in_group(group_id='xxxx-ssdsd-1212')
# Check whether a task is currently running. This isn't provided in the Standard API.
status = octo.is_task_running(task_id='abcd-1234-djfsd-dfdf')
# Get the data that has not been exported yet.
# (presumably `size` caps the number of rows returned -- confirm against the Octoparse API docs)
data = octo.get_not_exported_data(task_id='abcd-1234-djfsd-dfdf', size=100)
# Update the data status for the task.
# (presumably this marks previously fetched data as exported -- confirm against the Octoparse API docs)
resp = octo.update_data_status(task_id='abcd-1234-djfsd-dfdf')
# Get all the data for the task with task id 'abcd-1234-djfsd-dfdf'.
data = octo.get_task_data(task_id='abcd-1234-djfsd-dfdf')
# Get all the task data as a pandas.DataFrame for the task with task id 'abcd-1234-djfsd-dfdf'.
df = octo.get_task_data_df(task_id='abcd-1234-djfsd-dfdf')
# Get a slice of the data for the task with task id 'abcd-1234-djfsd-dfdf', starting at an offset,
# e.g. get 100 rows starting from row 200.
data = octo.get_data_by_offset(task_id='abcd-1234-djfsd-dfdf', offset=200, size=100)
# Fetch task data in a loop using the generator function.
# Each iteration yields one `data` value; the loop body must be indented
# so that both statements run once per iteration.
for data in octo.get_task_data_generator(task_id='abcd-1234-djfsd-dfdf', offset=200, size=100):
    print(data)
    # Placeholder: replace with your own processing of `data`.
    do_something_with_data()
# Clear the data for the task with task id 'abcd-1234-djfsd-dfdf'.
octo.clear_task_data(task_id='abcd-1234-djfsd-dfdf')
# Get the status of several tasks at once, given a list of task ids.
task_list = ['abcd-1234-djfsd-dfdf', 'ab23-5677-djfsd-dfdf']
resp = octo.get_task_status(task_list)
# Get a task parameter, addressed by its name (here 'loopAction1.Url').
resp = octo.get_task_param(task_id='abcd-1234-djfsd-dfdf', name='loopAction1.Url')
# Update a task parameter, addressed by its name, to a new value.
resp = octo.update_task_param(task_id='abcd-1234-djfsd-dfdf', name='loopAction1.Url', value='http://xyz.abc')
# Add new URLs/text to an existing loop.
resp = octo.add_url_text_to_loop(task_id='abcd-1234-djfsd-dfdf', name='loopAction1.Url', value='http://xyz.abc')
# Start running a task.
resp = octo.start_task(task_id='abcd-1234-djfsd-dfdf')
# Stop a running task.
resp = octo.stop_task(task_id='abcd-1234-djfsd-dfdf')