Skip to content

Commit

Permalink
Make embedding incremental
Browse files Browse the repository at this point in the history
  • Loading branch information
smolnar committed Sep 27, 2024
1 parent dacbcd1 commit 4ccb28d
Show file tree
Hide file tree
Showing 3 changed files with 62 additions and 20 deletions.
2 changes: 1 addition & 1 deletion config/schedule.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
set :output, 'log/cron.log'

every :day, at: '10 pm' do
runner 'ExceptionHandler.run { system("OPENCOURTS_DATABASE_NAME=opencourts_production python3 ml/decree-embeddings/embed.py") }'
runner 'ExceptionHandler.run { system("OPENCOURTS_DATABASE_NAME=opencourts_production INCREMENTAL=true python3 ml/decree-embeddings/embed.py") }'
end

every :day, at: '3:00 am' do
Expand Down
68 changes: 51 additions & 17 deletions ml/decree-embeddings/embed.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,14 @@

if __name__ == "__main__":
try:
model_path = os.path.join(
os.path.dirname(os.path.abspath(__file__)), "models", "reducer.pk"
)
incremental = os.getenv("INCREMENTAL", "false").lower() == "true"

if incremental:
logger.info("Incremental mode enabled")

vocabulary = repository.decrees_vocabulary()

logger.info(f"Loaded vocabulary with [{len(vocabulary)}] values")
Expand All @@ -24,7 +32,7 @@
ids = []
embeddings = []

for batch in repository.decrees():
for batch in repository.decrees(without_embedding_only=incremental):
start_time = time()

vectors = decrees_to_embeddings(vocabulary, batch)
Expand All @@ -38,33 +46,56 @@
ids.extend([decree["id"] for decree in batch])
embeddings.append(vectors)

if len(embeddings) == 0:
logger.info("No embeddings to process")
exit(0)

embeddings = vstack(embeddings)

embeddings_time_in_ms = (time() - embeddings_start_time) * 1000

logger.info(f"Finished embeddings in [{embeddings_time_in_ms:.2f}ms]")

memory = csr_matrix_memory_in_bytes(embeddings)
if incremental:
logger.info(f"Loading reducer model from [{model_path}] ...")

logger.debug(f"Memory usage for embeddings is [{memory / 1024 / 1024:.2f}MB]")
reducer_start_time = time()

sparsity = 1.0 - (
embeddings.count_nonzero() / (embeddings.shape[0] * embeddings.shape[1])
)
eps = math.floor(sparsity / 0.05) * 0.05
with open(model_path, "rb") as f:
reducer = pk.load(f)

logger.info(f"Sparsity of embeddings is [{sparsity}]")
logger.info(f"Setting eps of GaussianRandomProjection to [{eps}]")
reducer_time_in_ms = (time() - reducer_start_time) * 1000

reducer = GaussianRandomProjection(n_components=768, eps=eps)
logger.info(f"Reducer model loaded in [{reducer_time_in_ms:.2f}ms]")
else:
memory = csr_matrix_memory_in_bytes(embeddings)

reducer_start_time = time()
reducer.fit(embeddings)
reducer_time_in_ms = (time() - reducer_start_time) * 1000
logger.debug(
f"Memory usage for embeddings is [{memory / 1024 / 1024:.2f}MB]"
)

logger.info(
f"Finished fitting dimensionality reduction for embeddings in [{reducer_time_in_ms:.2f}ms]"
)
sparsity = 1.0 - (
embeddings.count_nonzero() / (embeddings.shape[0] * embeddings.shape[1])
)
eps = math.floor(sparsity / 0.05) * 0.05

logger.info(f"Sparsity of embeddings is [{sparsity}]")
logger.info(f"Setting eps of GaussianRandomProjection to [{eps}]")

reducer = GaussianRandomProjection(n_components=768, eps=eps)

reducer_start_time = time()
reducer.fit(embeddings)
reducer_time_in_ms = (time() - reducer_start_time) * 1000

logger.info(
f"Finished fitting dimensionality reduction for embeddings in [{reducer_time_in_ms:.2f}ms]"
)

with open(model_path, "wb") as f:
pk.dump(reducer, f)

logger.info(f"Reducer model saved to [{model_path}]")

batch_size = 10_000

Expand All @@ -87,8 +118,11 @@

time_in_ms = (time() - start_time) * 1000

# Sample at most 100 test rows; a smaller final batch cannot supply 100 distinct indices for np.random.choice with replace=False.
size = min(100, embeddings_batch.shape[0])

testing_indices = np.random.choice(
embeddings_batch.shape[0], size=100, replace=False
embeddings_batch.shape[0], size=size, replace=False
)

testing_embeddings.append(embeddings_batch[testing_indices, :])
Expand Down
12 changes: 10 additions & 2 deletions ml/decree-embeddings/repository.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,9 @@ def decrees_vocabulary(self):

@retry
@connection
def decrees(self, include_text=True, batch_size=10_000):
def decrees(
self, include_text=True, batch_size=10_000, without_embedding_only=False
):
cur = self._connection.cursor(cursor_factory=psycopg2.extras.RealDictCursor)
last_id = 0

Expand All @@ -101,6 +103,10 @@ def decrees(self, include_text=True, batch_size=10_000):
else ""
)

without_embedding_only_query = (
"AND embedding IS NULL" if without_embedding_only else ""
)

while True:
start_time = time()

Expand Down Expand Up @@ -136,7 +142,9 @@ def decrees(self, include_text=True, batch_size=10_000):
LEFT OUTER JOIN courts ON courts.id = decrees.court_id
LEFT OUTER JOIN court_types ON court_types.id = courts.court_type_id
WHERE decrees.id > %s
WHERE
decrees.id > %s
{without_embedding_only_query}
GROUP BY decrees.id
ORDER BY decrees.id ASC
Expand Down

0 comments on commit 4ccb28d

Please sign in to comment.