Skip to content

Commit

Permalink
#93: try to match YouTube video on title only when matching with title+speaker doesn't work well
Browse files Browse the repository at this point in the history

This is particularly relevant for talks involving many speakers, not all of whom are mentioned
in the YouTube video title (because the list would be too long).
  • Loading branch information
fcamblor committed May 1, 2024
1 parent a291d3a commit 3d29018
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 27 deletions.
53 changes: 33 additions & 20 deletions cloud/functions/src/functions/firestore/services/talk-utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -101,31 +101,44 @@ export function findYoutubeMatchingTalks(eventTalks: SimpleTalk[], youtubeVideos
const unmatchedTalks: SimpleTalk[] = [];

for(const talk of eventTalks) {
const talkLowTitle = `${talk.title.toLowerCase()} - ${talk.speakers.map(sp => sp.fullName.toLowerCase()).join(", ")}`
const talkAndSpeakerLowTitle = `${talk.title.toLowerCase()} - ${talk.speakers.map(sp => sp.fullName.toLowerCase()).join(", ")}`
const talkOnlypeakerLowTitle = talk.title.toLowerCase()

const matches = youtubeVideos.map(vid => {
const lowTitle = vid.title.toLowerCase();
const titleSimilarityScore = stringSimilarity(talkLowTitle, lowTitle);
const totalScore = Math.round(titleSimilarityScore*1000)/1000;
const videoLowTitle = vid.title.toLowerCase();
const titleWithSpeakersSimilarityScore = Math.round(stringSimilarity(talkAndSpeakerLowTitle, videoLowTitle)*1000)/1000;
const titleOnlySimilarityScore = Math.round(stringSimilarity(talkOnlypeakerLowTitle, videoLowTitle)*1000)/1000;

return { totalScore, titles: [talkLowTitle, lowTitle], speakers: talk.speakers.map(sp => sp.fullName), video: vid }
return { titleWithSpeakersSimilarityScore, titleOnlySimilarityScore, titles: [talkAndSpeakerLowTitle, videoLowTitle], speakers: talk.speakers.map(sp => sp.fullName), video: vid }
})

matches.sort((m1, m2) => m2.totalScore - m1.totalScore);
const bestMatch = matches[0]

if(bestMatch.totalScore > 0.7) {
matchedTalks.push({ score: bestMatch.totalScore, titles: bestMatch.titles, talk, video: bestMatch.video })
} else if(bestMatch.totalScore > 0.4) {
const candidatesWithAtLeastOneSpeakerFound = matches.filter(m =>
m.totalScore > 0.4
// not matching 1 speaker out of 2 is ok
// but wondering if matching "only" 3 speakers out of 6 is ok...
&& includedSpeakersRatio(m.titles[1], m.speakers) >= 0.5
)
if(candidatesWithAtLeastOneSpeakerFound.length) {
matchedTalks.push({ score: candidatesWithAtLeastOneSpeakerFound[0].totalScore, titles: candidatesWithAtLeastOneSpeakerFound[0].titles, talk, video: candidatesWithAtLeastOneSpeakerFound[0].video })
const matchesSortByTitleWithSpeakersSimilarityScore = [...matches].sort((m1, m2) => m2.titleWithSpeakersSimilarityScore - m1.titleWithSpeakersSimilarityScore);
const bestTitleWithSpeakersMatch = matchesSortByTitleWithSpeakersSimilarityScore[0]

// Matching both title + speaker with a relatively high score
if(bestTitleWithSpeakersMatch.titleWithSpeakersSimilarityScore > 0.7) {
matchedTalks.push({ score: bestTitleWithSpeakersMatch.titleWithSpeakersSimilarityScore, titles: bestTitleWithSpeakersMatch.titles, talk, video: bestTitleWithSpeakersMatch.video })
} else if(bestTitleWithSpeakersMatch.titleWithSpeakersSimilarityScore > 0.4) {
// Trying to find a title-only matching with high fidelity
const matchesSortByTitleOnlySimilarityScore = [...matches].sort((m1, m2) => m2.titleOnlySimilarityScore - m1.titleOnlySimilarityScore);
const bestTitleOnlyMatch = matchesSortByTitleOnlySimilarityScore[0];
if(bestTitleOnlyMatch.titleOnlySimilarityScore > 0.8) {
matchedTalks.push({ score: bestTitleOnlyMatch.titleOnlySimilarityScore, titles: bestTitleOnlyMatch.titles, talk, video: bestTitleOnlyMatch.video })
} else {
unmatchedTalks.push(talk);
// Trying to find talks with similar speakers
const candidatesWithAtLeastOneSpeakerFound = matchesSortByTitleWithSpeakersSimilarityScore.filter(m =>
m.titleWithSpeakersSimilarityScore > 0.4
// not matching 1 speaker out of 2 is ok
// but wondering if matching "only" 3 speakers out of 6 is ok...
&& includedSpeakersRatio(m.titles[1], m.speakers) >= 0.5
)

if(candidatesWithAtLeastOneSpeakerFound.length) {
const bestCandidatesWithGoodSpeakersMatchingRatio = candidatesWithAtLeastOneSpeakerFound[0];
matchedTalks.push({ score: bestCandidatesWithGoodSpeakersMatchingRatio.titleWithSpeakersSimilarityScore, titles: bestCandidatesWithGoodSpeakersMatchingRatio.titles, talk, video: bestCandidatesWithGoodSpeakersMatchingRatio.video })
} else {
unmatchedTalks.push(talk);
}
}
} else {
unmatchedTalks.push(talk);
Expand Down
15 changes: 8 additions & 7 deletions cloud/functions/test-data/vdbuh2024-talks-and-youtube.ts
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,14 @@ export const VDBUH2024_TALKS_AND_YOUTUBE = {
"talkId": "7456",
"videoId": "WAxBGqiWCI0"
},
{
"__score": 0.874,
"__talkTitle": "PANEL: Mastering the AI Shift: Adaptive Strategies for Software Evolution",
"__videoTitle": "[VDBUH2024] - Mastering the AI Shift: Adaptive Strategies for Software Evolution",
"__speakers": "Lucian Gruia, Brian Vermeer, Olimpiu Pop, valentin Tomici, PRATIK PATEL, Cristina Duta, Mihaela CUȚUI",
"talkId": "12602",
"videoId": "OTcNjWA0RZM"
},
{
"__score": 0.878,
"__talkTitle": "Maximizing Security with JDK: Understanding Built-in Capabilities",
Expand Down Expand Up @@ -197,13 +205,6 @@ export const VDBUH2024_TALKS_AND_YOUTUBE = {
}
],
"expectedUnmappedTalks": [
{
"__talkTitle": "PANEL: Mastering the AI Shift: Adaptive Strategies for Software Evolution",
"__talkFormat": "Panel (id=989, duration=PT45m)",
"__talkRoom": "Crowne B (4302)",
"__talkSpeakers": "Lucian Gruia, Brian Vermeer, Olimpiu Pop, valentin Tomici, PRATIK PATEL, Cristina Duta, Mihaela CUȚUI",
"talkId": "12602"
},
{
"__talkTitle": " Fast And Lightweight Spring Boot Applications With GraalVM",
"__talkFormat": "Conference (id=951, duration=PT60m)",
Expand Down

0 comments on commit 3d29018

Please sign in to comment.