Skip to content

Commit

Permalink
SFR-2249: Refactor and clean up s3 file process (#399)
Browse files Browse the repository at this point in the history
  • Loading branch information
kylevillegas93 authored Oct 16, 2024
1 parent fc1ada9 commit 49c98e4
Show file tree
Hide file tree
Showing 3 changed files with 228 additions and 235 deletions.
159 changes: 79 additions & 80 deletions processes/s3Files.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,112 +18,111 @@ def __init__(self, *args):
super(S3Process, self).__init__(*args[:4])

def runProcess(self):
self.receiveAndProcessMessages()
try:
number_of_processes = 4
file_processes = []

for _ in range(number_of_processes):
file_process = Process(target=S3Process.process_files)
file_process.start()

def receiveAndProcessMessages(self):
processes = 4
epubProcesses = []
for _ in range(processes):
proc = Process(target=S3Process.storeFilesInS3)
proc.start()
epubProcesses.append(proc)
file_processes.append(file_process)

for proc in epubProcesses:
proc.join()
for file_process in file_processes:
file_process.join()
except Exception:
logger.exception('Failed to run S3 Process')

@staticmethod
def storeFilesInS3():
storageManager = S3Manager()
storageManager.createS3Client()

fileQueue = os.environ['FILE_QUEUE']
fileRoute = os.environ['FILE_ROUTING_KEY']
epubConverterURL = os.environ['WEBPUB_CONVERSION_URL']

rabbitManager = RabbitMQManager()
rabbitManager.createRabbitConnection()
rabbitManager.createOrConnectQueue(fileQueue, fileRoute)

bucket = os.environ['FILE_BUCKET']

attempts = 1
while True:
msgProps, _, msgBody = rabbitManager.getMessageFromQueue(fileQueue)
if msgProps is None:
if attempts <= 3:
sleep(30 * attempts)
attempts += 1
continue
def process_files():
storage_manager = S3Manager()
storage_manager.createS3Client()

file_queue = os.environ['FILE_QUEUE']
file_route = os.environ['FILE_ROUTING_KEY']

rabbit_mq_manager = RabbitMQManager()
rabbit_mq_manager.createRabbitConnection()
rabbit_mq_manager.createOrConnectQueue(file_queue, file_route)

s3_file_bucket = os.environ['FILE_BUCKET']

attempts_to_poll = 1
max_poll_attempts = 3

while attempts_to_poll <= max_poll_attempts:
message_props, _, message_body = rabbit_mq_manager.getMessageFromQueue(file_queue)

if not message_props:
if attempts_to_poll <= max_poll_attempts:
wait_time = attempts_to_poll * 30

logger.info(f'Waiting {wait_time}s for S3 file messages')
sleep(wait_time)

attempts_to_poll += 1
else:
logger.info('Exiting S3 process - no more messages.')
break

attempts = 1
continue

attempts_to_poll = 1

fileMeta = json.loads(msgBody)['fileData']
fileURL = fileMeta['fileURL']
filePath = fileMeta['bucketPath']
file_data = json.loads(message_body)['fileData']
file_url = file_data['fileURL']
file_path = file_data['bucketPath']

try:
logger.info('Storing {}'.format(fileURL))
epubB = S3Process.getFileContents(fileURL)

storageManager.putObjectInBucket(epubB, filePath, bucket)
file_contents = S3Process.get_file_contents(file_url)

if '.epub' in filePath:
fileRoot = '.'.join(filePath.split('.')[:-1])
storage_manager.putObjectInBucket(file_contents, file_path, s3_file_bucket)

del file_contents

webpubManifest = S3Process.generateWebpub(
epubConverterURL, fileRoot, bucket
)
if '.epub' in file_path:
file_root = '.'.join(file_path.split('.')[:-1])

storageManager.putObjectInBucket(
webpubManifest,
'{}/manifest.json'.format(fileRoot),
bucket
)
web_pub_manifest = S3Process.generate_webpub(file_root, s3_file_bucket)

rabbitManager.acknowledgeMessageProcessed(msgProps.delivery_tag)
storage_manager.putObjectInBucket(web_pub_manifest, f'{file_root}/manifest.json', s3_file_bucket)

logger.info('Sending Tag {} for {}'.format(fileURL, msgProps.delivery_tag))
rabbit_mq_manager.acknowledgeMessageProcessed(message_props.delivery_tag)

del epubB
except Exception as e:
logger.error('Unable to store file in S3')
logger.debug(e)
logger.info(f'Stored file in S3 for {file_url}')
except Exception:
logger.exception(f'Failed to store file for file url: {file_url}')

@staticmethod
def getFileContents(epubURL):
timeout = 15
epubResp = requests.get(
epubURL,
def get_file_contents(file_url: str):
file_url_response = requests.get(
file_url,
stream=True,
timeout=timeout,
headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)'}
timeout=15,
headers={ 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)' }
)

if epubResp.status_code == 200:
content = bytes()
for byteChunk in epubResp.iter_content(1024 * 250):
content += byteChunk
if file_url_response.status_code == 200:
file_contents = bytes()

return content
for byte_chunk in file_url_response.iter_content(1024 * 250):
file_contents += byte_chunk

raise Exception('Unable to fetch ePub file')
return file_contents

@staticmethod
def generateWebpub(converterRoot, fileRoot, bucket):
s3Path = 'https://{}.s3.amazonaws.com/{}/META-INF/container.xml'.format(
bucket, fileRoot
)
raise Exception(f'Unable to fetch file from url: {file_url}')

converterURL = '{}/api/{}'.format(converterRoot, quote_plus(s3Path))
@staticmethod
def generate_webpub(file_root, bucket):
webpub_conversion_url = os.environ['WEBPUB_CONVERSION_URL']
s3_file_path = f'https://{bucket}.s3.amazonaws.com/{file_root}/META-INF/container.xml'
webpub_conversion_url = f'{webpub_conversion_url}/api/{quote_plus(s3_file_path)}'

try:
webpubResp = requests.get(converterURL, timeout=15)
webpub_response = requests.get(webpub_conversion_url, timeout=15)

webpubResp.raise_for_status()
webpub_response.raise_for_status()

return webpubResp.content
except Exception as e:
logger.warning('Unable to generate webpub')
logger.debug(e)
return webpub_response.content
except Exception:
logger.exception(f'Failed to generate webpub for {file_root}')
149 changes: 149 additions & 0 deletions tests/unit/test_s3_files_process.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
import pytest
import requests

from tests.helper import TestHelpers
from processes import S3Process


class TestS3Process:
    """Unit tests for the S3Process file-storage pipeline."""

    @classmethod
    def setup_class(cls):
        # Populate the env vars (queue names, bucket, conversion URL) the
        # process reads via os.environ.
        TestHelpers.setEnvVars()

    @classmethod
    def teardown_class(cls):
        TestHelpers.clearEnvVars()

    @pytest.fixture
    def test_instance(self, mocker):
        # Subclass with a stubbed __init__ so constructing an instance opens
        # no real AWS/DB/RabbitMQ connections. Renamed from the original
        # inner "TestS3Process" to avoid shadowing this test class's name.
        class StubS3Process(S3Process):
            def __init__(self, process, customFile, ingestPeriod):
                self.bucket = 'testBucket'

        return StubS3Process('TestProcess', 'testFile', 'testDate')

    @pytest.fixture
    def test_file_message(self):
        # Mirrors the JSON message shape placed on the RabbitMQ file queue.
        return """
        {
            "fileData": {
                "fileURL": "testSourceURL",
                "bucketPath": "testBucketPath.epub"
            }
        }
        """

    def test_run_process(self, test_instance, mocker):
        """runProcess spawns four worker processes targeting
        S3Process.process_files, starts each, and joins each."""
        mock_process_files = mocker.patch.object(S3Process, 'process_files')
        # Patched purely to isolate the test from any base-class side
        # effects; runProcess itself does not call them.
        mocker.patch.object(S3Process, 'saveRecords')
        mocker.patch.object(S3Process, 'commitChanges')

        mock_file_process = mocker.MagicMock()
        mock_process = mocker.patch('processes.s3Files.Process')
        mock_process.return_value = mock_file_process

        test_instance.runProcess()

        # FIX: the original `mock_process_files.assert_called_once` lines were
        # bare attribute accesses (no parentheses) and asserted nothing.
        # Because Process is mocked, process_files is never invoked here;
        # the correct check is that it was wired in as the worker target.
        mock_process.assert_called_with(target=mock_process_files)
        assert mock_process.call_count == 4
        assert mock_file_process.start.call_count == 4
        assert mock_file_process.join.call_count == 4

    def test_process_files(self, test_file_message, mocker):
        """process_files stores one file, acks its message, then backs off
        30s/60s/90s on empty polls before exiting."""
        mock_sleep = mocker.patch('processes.s3Files.sleep')

        mock_s3 = mocker.MagicMock()
        mocker.patch('processes.s3Files.S3Manager', return_value=mock_s3)

        mock_rabbit_mq = mocker.MagicMock()
        mocker.patch('processes.s3Files.RabbitMQManager', return_value=mock_rabbit_mq)

        mock_message_props = mocker.MagicMock()
        mock_message_props.delivery_tag = 'rabbitMQTag'
        # One real message, then empty polls: the worker sleeps for
        # 30/60/90 seconds and exits after the third empty retry.
        mock_rabbit_mq.getMessageFromQueue.side_effect = [
            (mock_message_props, {}, test_file_message),
            (None, None, None),
            (None, None, None),
            (None, None, None),
            (None, None, None),
        ]

        mock_get_file_contents = mocker.patch.object(S3Process, 'get_file_contents')
        mock_get_file_contents.return_value = 'testFileBytes'

        mock_generate_webpub = mocker.patch.object(S3Process, 'generate_webpub')
        mock_generate_webpub.return_value = 'testWebpubJson'

        S3Process.process_files()

        assert mock_rabbit_mq.getMessageFromQueue.call_count == 4
        mock_rabbit_mq.getMessageFromQueue.assert_called_with('test_file_queue')

        mock_sleep.assert_has_calls([
            mocker.call(30), mocker.call(60), mocker.call(90)
        ])

        mock_generate_webpub.assert_called_once_with('testBucketPath', 'test_aws_bucket')

        # The raw epub and its generated webpub manifest are both stored.
        mock_s3.putObjectInBucket.assert_has_calls([
            mocker.call('testFileBytes', 'testBucketPath.epub', 'test_aws_bucket'),
            mocker.call('testWebpubJson', 'testBucketPath/manifest.json', 'test_aws_bucket')
        ])
        mock_rabbit_mq.acknowledgeMessageProcessed.assert_called_once_with('rabbitMQTag')

    def test_get_file_contents_success(self, test_instance, mocker):
        """A 200 response is streamed in chunks and returned as bytes."""
        mock_get_request = mocker.patch.object(requests, 'get')
        mock_response = mocker.MagicMock()
        mock_response.status_code = 200
        mock_response.iter_content.return_value = [b'e', b'p', b'u', b'b']
        mock_get_request.return_value = mock_response

        test_file = test_instance.get_file_contents('testURL')

        assert test_file == b'epub'
        mock_get_request.assert_called_once_with(
            'testURL',
            stream=True,
            timeout=15,
            headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)'}
        )

    def test_get_file_contents_error(self, test_instance, mocker):
        """A non-200 response raises instead of returning partial content."""
        mock_get_request = mocker.patch.object(requests, 'get')
        mock_response = mocker.MagicMock()
        mock_response.status_code = 500
        mock_get_request.return_value = mock_response

        with pytest.raises(Exception):
            test_instance.get_file_contents('testURL')

    def test_generate_webpub_success(self, mocker):
        """The converter is called with the URL-encoded container.xml path
        and its response body is returned."""
        mock_get_request = mocker.patch.object(requests, 'get')
        mock_response = mocker.MagicMock(content='testWebpub')
        mock_get_request.return_value = mock_response

        test_webpub = S3Process.generate_webpub('testRoot', 'testBucket')

        assert test_webpub == 'testWebpub'

        mock_get_request.assert_called_once_with(
            'test_conversion_url/api/https%3A%2F%2FtestBucket.s3.amazonaws.com%2FtestRoot%2FMETA-INF%2Fcontainer.xml',
            timeout=15
        )

    def test_generate_webpub_error(self, mocker):
        """A failed converter request is swallowed and yields None."""
        mock_get_request = mocker.patch.object(requests, 'get')
        mock_response = mocker.MagicMock(content='testWebpub')
        mock_response.raise_for_status.side_effect = Exception
        mock_get_request.return_value = mock_response

        test_webpub = S3Process.generate_webpub('testRoot', 'testBucket')

        # FIX: `== None` replaced with the identity check `is None`.
        assert test_webpub is None

        mock_get_request.assert_called_once_with(
            'test_conversion_url/api/https%3A%2F%2FtestBucket.s3.amazonaws.com%2FtestRoot%2FMETA-INF%2Fcontainer.xml',
            timeout=15
        )
Loading

0 comments on commit 49c98e4

Please sign in to comment.