# FediXbSky.py

'''Class(es) to convert partially processed fedifeed (after sorting and marking)
into a feed of bluesky posts
'''
import os
import shutil
import urllib.request
import tempfile
import json
import pandoc
import numpy
# from  exif import Image

# from PIL import Image # PIL is Pillow
# from pgmagick import gminfo
# from pgmagick import Image as pg_Image
# import cv2 as cv

from  html2text import HTML2Text

from SaxeBlueSkyPython import ticktocktime #import bsky_time_now, tuple_time2unix
from SaxeBlueSkyPython.ticktocktime import bsky_time_now, tuple_time2unix
from SaxeBlueSkyPython import bsky_media_funcs
from SaxeBlueSkyPython.bsky_media_funcs import imageMetadataRemove, image_shrink
from SaxeBlueSkyPython.bsky_media_funcs import CheckImageAspectRatio, CheckImageSize

class BasicBlueskyQueue:
    '''Holder for a whole queue of posts (not a single entry).

    Three lists are kept side by side:

    * ``raw_fedi_feed``     -- the sorted/marked fediverse feed handed to
      the constructor (stored as-is, not copied)
    * ``first_pass_queue``  -- filled by ``first_clean``
    * ``second_pass_queue`` -- filled by ``second_clean``
    '''

    def __init__(self, sortedmarked_fedi_feed):
        self.raw_fedi_feed = sortedmarked_fedi_feed
        self.first_pass_queue = []
        self.second_pass_queue = []

    # ---- diagnostic printing -------------------------------------------

    def json_raw(self):
        '''Print the raw feed to stdout as JSON.'''
        print(json.dumps(self.raw_fedi_feed))

    def json_firstpassq(self):
        '''Print the first pass queue to stdout as JSON.'''
        print(json.dumps(self.first_pass_queue))

    def json_secondpassq(self):
        '''Print the second pass queue to stdout as JSON.'''
        print(json.dumps(self.second_pass_queue))

    # ---- element access ------------------------------------------------

    def first_queue_itm_exgest(self, index):
        '''Return the element of the first pass queue at ``index``.'''
        return self.first_pass_queue[index]

    def second_queue_itm_exgest(self, index):
        '''Return the element of the second pass queue at ``index``.'''
        return self.second_pass_queue[index]

    # ---- sizes -----------------------------------------------------------

    def first_queue_sze(self):
        '''Return the number of entries in the first pass queue.'''
        return len(self.first_pass_queue)

    def second_queue_sze(self):
        '''Return the number of entries in the second pass queue.'''
        return len(self.second_pass_queue)

    # ---- processing passes ---------------------------------------------

    def first_clean(self):
        '''
        First pass: basic mapping and clean-up, mapping
        Mastodon/Fediverse elements to bsky elements.

        Every raw feed entry is run through ``FirstEntryXwalk.mapper`` and
        the result is appended to ``first_pass_queue``.  The queue is not
        cleared first, so calling this twice appends the results twice.
        '''
        for raw_entry in self.raw_fedi_feed:
            mapped_entry = FirstEntryXwalk(raw_entry).mapper()
            self.first_pass_queue.append(mapped_entry)

    def second_clean(self):
        '''
        Second pass: polish text by improving hashtag and link handling.

        For every first-pass entry, in this order:

        * strip the ugly tag links from the body text
        * prepend the origin link
        * append the hashtag text

        and append the exported entry to ``second_pass_queue``.

        ** NOTE: will need to move links and tags to rich text
        facets for proper operation
        '''
        for mapped_entry in self.first_pass_queue:
            polisher = SecondEntryXwalk(mapped_entry)
            polisher.basic_text_fix()
            polisher.body_text_prepend()
            polisher.body_text_tag_append()
            self.second_pass_queue.append(polisher.data_export())

class SecondEntryXwalk:
    '''
    Second-pass cleaning and polishing of one entry (not a queue).

    Works in place on the dictionary passed to the constructor:

        * Strip out ugly tag links from body text
        * Prepend text with origin link
        * Append cleaned body text with clean hashtag text

    The methods are meant to be called in that order; the later two read
    the ``cleaned_basic_text`` key that ``basic_text_fix`` writes.
    '''

    def __init__(self, entry):
        self.entry = entry
        self.egress = {}

    def basic_text_fix(self):
        '''
        Remove ugly URL-ish tags from body text.

        Keeps everything in ``entry['basic_text']`` before the first '['
        (the whole text when there is none), adds a trailing newline,
        stores it under ``cleaned_basic_text`` and returns it.
        Hashtags are re-added later in a pithier form.
        '''
        head, _, _ = self.entry['basic_text'].partition('[')
        clean_body_text = head + '\n'
        self.entry['cleaned_basic_text'] = clean_body_text
        return clean_body_text

    def body_text_prepend(self):
        '''
        Prepend a "<via: URL>" line with the original post url.

        A missing (or None) ``post_link`` is rendered as a single space.
        '''
        post_url = self.entry.get('post_link')  # might be absent
        if post_url is None:
            post_url = ' '
        statement = '<via: ' + post_url + '>'
        self.entry['cleaned_basic_text'] = \
            statement + '\n' + self.entry['cleaned_basic_text']

    def body_text_tag_append(self):
        '''
        Append " #term" to the cleaned text for every tag that has a term.

        Tags can arrive in two forms: mappings carrying a 'term' key, or
        plain strings.  Only the mapping form is appended; plain strings
        (and an empty/missing tag collection) leave the text unchanged.
        The test is made on each tag, not on the tag collection.
        '''
        tags = self.entry.get('tags') or ()
        hashtag_text = ''
        for tag in tags:
            # isinstance guard: `'term' in tag` on a plain string would be a
            # substring test and tag['term'] would then raise TypeError.
            if isinstance(tag, dict) and tag.get('term'):
                hashtag_text = hashtag_text + ' ' + '#' + tag['term']

        if hashtag_text:
            self.entry['cleaned_basic_text'] = \
                self.entry['cleaned_basic_text'] + hashtag_text

        return

    def json_entry(self):
        ''' Diagnostic printing using json'''
        print(json.dumps(self.entry))
        return

    def data_export(self):
        ''' Export data to post builders (returns the entry dict itself) '''
        return self.entry

class FirstEntryXwalk:
    '''First-pass fediverse -> bluesky crosswalk for one entry (not a queue).

    ``entry`` holds the incoming feed entry; ``exit`` holds the merged
    dictionary produced by the most recent ``mapper`` call.

    # Need to add original link handling
    '''

    def __init__(self, entry):
        self.entry = entry
        self.exit = {}

    def json_entry(self):
        '''Diagnostic helper: print the incoming entry as JSON.'''
        print(json.dumps(self.entry))

    def mapper(self):
        '''Build, store and return the first-pass dictionary for the entry.

        Merge order (later updates win on key clashes):

        1. basic fields           (``entryAddBasic``)
        2. ratings                (``entryAddRatings``)
        3. content warning        (``entry_cw_split``, or '' when absent)
        4. sensitivity flag       (``entryCheckSensitive``)
        5. media, when present    (``MediaContentAttach`` plus alt text)
        6. ``has_media`` flag
        '''
        basics = entryAddBasic(self.entry)
        ratings = entryAddRatings(self.entry)

        # A content warning is only split off when the entry carries one.
        if entry_cw_check(self.entry):
            warning = entry_cw_split(self.entry)
        else:
            warning = {'content_warn': ''}

        merged = {}
        for part in (basics, ratings, warning):
            merged.update(part)

        merged.update(entryCheckSensitive(merged))

        media_present = MediaContentCheck(self.entry)
        if media_present:
            merged['has_media'] = True

            # Video/gif screening is left to bsky_mediaMash.
            # MediaContentAttach hands back the entry it was given, so
            # every key of that entry is merged here as well.
            merged.update(MediaContentAttach(self.entry))

            # One alt-text collection per post, wrapped in a list.
            merged['alt_text_vector'] = [media_altText_fetch(self.entry)]

        # Written last so nothing merged above can override it.
        merged['has_media'] = bool(media_present)

        self.exit = merged
        return self.exit

    def data_export(self):
        '''Export data to post builders (currently a stub returning None).'''
        return None

# End Classes

# Start Functions

def export_basic(entry):
    '''Placeholder: ignores ``entry`` and returns None.'''
    return None


def MediaContentCheck (element):
    '''Return True when ``element`` has a non-None 'media_content' value.

    A missing key and an explicit None both count as "no media"; an empty
    list still counts as media being present.
    '''
    media = element.get('media_content')
    return media is not None

def MediaTypeCheck (element):
    '''
    Check if video or gif, which are not currently supported in
    bluesky (5/13/2024)

    Simplistic approach; if any media element of the post is problematic,
    the flag for the whole post is raised.

    element -- mapping with a 'media_content' sequence of mappings; each
               item may carry 'type' and 'medium' (either may be missing).

    Returns a tuple ``(has_vid, has_gif)`` of 0/1 flags.  As a side
    effect the element is printed to stdout as JSON (diagnostic).
    '''

    has_vid = 0
    has_gif = 0

    print (json.dumps(element))

    for media in element['media_content']:
        # A missing 'type'/'medium' is treated as an empty string so the
        # substring tests below cannot fail on None.
        mime_type = media.get('type') or ''
        medium = media.get('medium') or ''

        if 'video' in mime_type:
            has_vid = 1
        elif 'gif' in mime_type or 'GIF' in mime_type:
            has_gif = 1
        elif medium != 'image':
            # Anything that is not explicitly an image (video, or an
            # unknown/missing medium) is flagged like a video.
            has_vid = 1

    return (has_vid, has_gif)


def MediaContentAttach (medElem):
    '''Attach handled media information to an entry.

    Needs to handle media sets from 1 to ...; the check for the presence
    of media is needed before invoking this (``medElem['media_content']``
    must exist).

    Each item of ``medElem['media_content']`` is passed to
    ``MediaContentHandle``; the result, extended with 'Entry' (1-based
    position) and 'Of' (total count), becomes one element of a list that
    is stored under ``medElem['SetMediaElements']``.

    The entry is modified in place and the same object is returned.
    '''
    medContent = medElem['media_content']
    numContent = len (medContent)
    media_entries = []

    for position, media in enumerate (medContent, start=1):
        # A fresh dict per attachment: sharing one dict across iterations
        # would leave every list element pointing at the last attachment.
        entry_info = {}
        entry_info.update (MediaContentHandle (media))
        entry_info.update ({'Entry': position, 'Of': numContent})
        media_entries.append (entry_info)

    medElem.update ({'SetMediaElements': media_entries})
    return medElem

def media_altText_fetch (medElem):
    '''Collect the 'value' of every item under ``medElem['content']``.

    Returns a list of those values in order, or the empty string ''
    (not an empty list) when the 'content' key is absent.
    '''
    if 'content' not in medElem:
        return ''

    return [item['value'] for item in medElem['content']]


def MediaContentHandle(MediaContentSet):
    '''Download one media item and describe it as a dictionary.

    MediaContentSet -- mapping with the keys 'url', 'type', 'filesize'
                       and 'medium' (all four are required).

    The file behind 'url' is fetched with ``remoteImageGet``; its local
    path and the value reported by ``CheckImageSize`` for that path are
    recorded.  The aspect-ratio fields are set to 0 here; they will be
    populated later in bsky media mash, as are exif removal and shrinking.
    '''
    # Read every required field up front so a missing key fails before
    # anything is downloaded.
    remote_url = MediaContentSet['url']
    mime_type = MediaContentSet['type']
    stated_size = MediaContentSet['filesize']
    medium = MediaContentSet['medium']

    # Basic attachment; further fixes in bluesky media mash
    local_path = remoteImageGet(remote_url)
    measured_size = CheckImageSize(local_path)

    return {
        'media_url': remote_url,
        'media_type': mime_type,
        'media_size_stated': stated_size,
        'media_size_calculated': measured_size,
        'medium_type': medium,
        'localFilePath': local_path,
        'aspectRatio': {
            'height': 0,
            'width': 0,
            'colorChannels': 0,
        },
    }

def entryAddBasic(rawpost):
    ''' Starting process of transforming fedi post to bsky post
    does the super basic, should always be there fields.

    rawpost -- feed entry; the keys 'summary', 'summary_detail' (with
               'value', 'language', 'base'), 'published_parsed' and
               'link' are read.

    Returns a dict with the keys orig_text, basic_text,
    cleaned_basic_text, lang_of_post, post_link, base_url,
    published_parsed, orig_post_time, SetMediaElements and tags.
    'cleaned_basic_text' is left as the placeholder 'string' and
    'SetMediaElements' as an empty list.

    NB: current use of html2text needs improvement.
    First cut: use option to remove links
    Second cut: use pandoc
    Third idea: combine html2text and pandoc
    Or use strip/replace from python libs
    '''

    h = HTML2Text()
    h.ignore_emphasis = True  # html2text spells the option "ignore_emphasis"
    h.single_line_break = True
    h.inline_links = False
    h.skip_internal_links = True
    # NOTE(review): not aware of an HTML2Text option named re_space;
    # kept as in the original -- confirm whether it has any effect.
    h.re_space = True

    e = rawpost

    g = {
        'orig_text': 'html_laden_string',
        'basic_text': 'string',
        'cleaned_basic_text':'string',
        'lang_of_post': 'string',
        'post_link': 'URL of post',
        'base_url': 'base location of feed',
        'published_parsed': 'published parsed python',
        'orig_post_time': 'convert to unixtime',
        'SetMediaElements': [], #placeholder for media if present'
        }

    g['orig_text'] = e['summary']

    # Adds the 'tags' key
    g.update (entryFixTags (e))

    # About here need to remove hashtag links from main text body
    tmp = h.handle(e['summary']) # remove html
    tmp2 = tmp.strip() # remove leading/trailing whitespace
    g['basic_text'] = tmp2.replace('\n',' ') # new lines inside string become ' '

    g['lang_of_post'] = e['summary_detail']['language'] # as of feb 2024, usually = none
    g['base_url'] = e ['summary_detail']['base']
    g['published_parsed'] = e['published_parsed']
    g['orig_post_time'] = tuple_time2unix (e['published_parsed'])
    g['post_link'] = e['link']

    return g

def entryAddRatings (rawpost):
    '''Return the rating fields of a post, defaulting to 'nonadult'.

    Both source elements are conditional:

    * 'media_rating' -- when present, its 'content' item is used
    * 'rating'       -- when present, used as-is

    The result always has exactly the keys 'media_rating' and 'rating',
    in that order.
    '''
    media_rating = rawpost.get('media_rating')
    rating = rawpost.get('rating')

    return {
        'media_rating': 'nonadult' if media_rating is None
                        else media_rating['content'],
        'rating': 'nonadult' if rating is None else rating,
    }

def find_tags (long_string):
    '''Extract "#..." fragments that are closed by a ']'.

    Scans the string once.  A tag runs from the most recent '#' up to
    (not including) the next ']'.  A ']' only closes a tag when a '#'
    has been seen since the previous tag was closed, so ordinary
    bracketed links do not produce bogus tags.

    Returns the list of tags (each still carrying its leading '#'),
    in order of appearance; empty when none are found.
    '''
    tags = []
    tag_start = None  # index of the currently open '#', if any

    for position, char in enumerate (long_string):
        if char == '#':
            tag_start = position
        elif char == ']' and tag_start is not None:
            tags.append (long_string[tag_start:position])
            tag_start = None  # tag consumed; wait for the next '#'

    return tags


def entryFixTags (rawentry):
    '''Build the 'tags' item for a raw feed entry.

    The HTML in ``rawentry['summary_detail']['value']`` is converted with
    a default HTML2Text instance.  When the converted text contains a '#',
    the result of ``find_tags`` on it is used; otherwise the tags value is
    the empty string ''.

    Returns a one-key dict: ``{'tags': ...}``.
    '''
    converter = HTML2Text()
    converted = converter.handle(rawentry['summary_detail']['value'])

    found = find_tags (converted) if '#' in converted else ''
    return {'tags': found}

def entry_cw_check (rawpost): # Change this to rawpost
    '''Tell whether a post's summary carries a content warning.

    Mastodon/fediverse munges the content warning and the main content
    together; the html string '<hr />' in ``rawpost['summary']`` indicates
    that the text left of it is a content warning.
    ***NEEDS WORK***

    Returns True when the marker is present, False otherwise.
    '''
    return "<hr />" in rawpost['summary']

def entry_cw_split (rawpost):
    '''Split the CW off from the main text and return the CW
    CW = content warning

    The caller must already have checked that '<hr />' occurs in
    ``rawpost['summary']``; everything left of its first occurrence is
    taken as the content warning.  The html is converted with HTML2Text
    and surrounding whitespace/newlines are stripped.

    Returns ``{'content_warn': <plain text>}``.
    '''
    h = HTML2Text()
    h.ignore_emphasis = True  # html2text spells the option "ignore_emphasis"
    h.single_line_break = True
    rps = rawpost['summary']

    hrLoc = rps.find('<hr />')
    cw_tmp = h.handle(rps[0:hrLoc]).strip()

    return {'content_warn': cw_tmp}


def entryCheckSensitive(cleanerPost):
    '''Decide whether a post should be marked sensitive.

    cleanerPost -- mapping with the keys 'media_rating', 'rating' and
                   'content_warn' (all three are read).

    The post is sensitive when either rating differs from 'nonadult' or
    the content warning is not the empty string.

    Returns ``{'sensitive_post': True/False}``.
    '''
    # Look all three up first so a missing key always raises.
    media_rating = cleanerPost['media_rating']
    rating = cleanerPost['rating']
    content_warn = cleanerPost['content_warn']

    flagged = bool(
        media_rating != 'nonadult'
        or rating != 'nonadult'
        or content_warn != ''
    )
    return {'sensitive_post': flagged}

def remoteImageGet(url):
    '''
    Fetch an actual image from a fediverse post so it can be placed into
    an outgoing post.

    The response body for ``url`` is copied into a new temporary file,
    which is created with delete=False and therefore stays on disk after
    this function returns.

    Returns the path of that temporary file.
    '''
    # NOTE(review): no timeout or URL-scheme restriction is applied here;
    # confirm that callers only pass trusted http(s) URLs.
    with urllib.request.urlopen(url, data=None) as remote, \
            tempfile.NamedTemporaryFile(delete=False) as local_copy:
        shutil.copyfileobj(remote, local_copy)

    return local_copy.name