Add git sync

jeremystretch committed 2023-01-26 14:03:08 -05:00
commit bfb21f62de, parent 6d5e33b785
3 changed files with 59 additions and 21 deletions

View File

@@ -14,5 +14,5 @@ class DataSourceAdmin(admin.ModelAdmin):
 
 @admin.register(DataFile)
 class DataFileAdmin(admin.ModelAdmin):
-    list_display = ('path', 'size')
+    list_display = ('path', 'size', 'last_updated')
     readonly_fields = ('source', 'path', 'last_updated', 'size', 'checksum')

View File

@@ -1,4 +1,4 @@
-# Generated by Django 4.1.5 on 2023-01-24 18:56
+# Generated by Django 4.1.5 on 2023-01-26 19:00
 
 from django.db import migrations, models
 import django.db.models.deletion

@@ -22,6 +22,9 @@ class Migration(migrations.Migration):
                 ('description', models.CharField(blank=True, max_length=200)),
                 ('url', models.CharField(max_length=200)),
                 ('ignore_rules', models.TextField(blank=True)),
+                ('username', models.CharField(blank=True, max_length=100)),
+                ('password', models.CharField(blank=True, max_length=100)),
+                ('git_branch', models.CharField(blank=True, max_length=100)),
             ],
             options={
                 'ordering': ('name',),
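
Note: the commit adds the three new columns by editing the app's initial migration in place, which is only safe while that migration has not yet been applied anywhere. Against an existing database, the same schema change would ship as a separate migration; a minimal sketch (file name and dependency are hypothetical):

    # core/migrations/0002_datasource_git_fields.py (hypothetical name)
    from django.db import migrations, models

    class Migration(migrations.Migration):
        dependencies = [('core', '0001_initial')]
        operations = [
            migrations.AddField('datasource', 'username', models.CharField(blank=True, max_length=100)),
            migrations.AddField('datasource', 'password', models.CharField(blank=True, max_length=100)),
            migrations.AddField('datasource', 'git_branch', models.CharField(blank=True, max_length=100)),
        ]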

View File

@@ -1,8 +1,10 @@
 import logging
 import os
+import subprocess
+import tempfile
 from functools import cached_property
 from fnmatch import fnmatchcase
-from urllib.parse import urlparse
+from urllib.parse import quote, urlunparse, urlparse
 
 from django.conf import settings
 from django.db import models
@@ -49,12 +51,24 @@ class DataSource(models.Model):
         blank=True,
         help_text=_("Patterns (one per line) matching files to ignore when syncing")
     )
+    username = models.CharField(
+        max_length=100,
+        blank=True
+    )
+    password = models.CharField(
+        max_length=100,
+        blank=True
+    )
+    git_branch = models.CharField(
+        max_length=100,
+        blank=True
+    )
 
     class Meta:
         ordering = ('name',)
 
     def __str__(self):
-        return self.name
+        return f'{self.name} ({self.get_type_display()})'
 
     # def get_absolute_url(self):
     #     return reverse('core:datasource', args=[self.pk])
@@ -64,9 +78,10 @@ class DataSource(models.Model):
         Create/update/delete child DataFiles as necessary to synchronize with the remote source.
         """
         # Replicate source data locally (if needed)
-        source_root = self.fetch()
-        logger.debug(f'Syncing files from source root {source_root}')
+        temp_dir = tempfile.TemporaryDirectory()
+        self.fetch(path=temp_dir.name)
+        print(f'Syncing files from source root {temp_dir.name}')
 
         data_files = self.datafiles.all()
         known_paths = {df.path for df in data_files}
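
Note: sync() now stages remote data in a tempfile.TemporaryDirectory, which must be released explicitly via temp_dir.cleanup() (added later in this diff). The lifecycle in isolation:

    import os
    import tempfile

    temp_dir = tempfile.TemporaryDirectory()  # creates a unique scratch directory
    assert os.path.isdir(temp_dir.name)       # temp_dir.name is its filesystem path
    temp_dir.cleanup()                        # removes the directory and all contents
    assert not os.path.isdir(temp_dir.name)

Using it as a context manager (with tempfile.TemporaryDirectory() as path: ...) would guarantee cleanup even when fetch() raises; the explicit cleanup() call below runs only on the success path.
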
@@ -76,7 +91,7 @@ class DataSource(models.Model):
 
         for datafile in data_files:
             try:
-                if datafile.refresh_from_disk(root_path=source_root):
+                if datafile.refresh_from_disk(source_root=temp_dir.name):
                     updated_files.append(datafile)
             except FileNotFoundError:
                 # File no longer exists
@@ -92,32 +107,53 @@ class DataSource(models.Model):
         logger.debug(f"Deleted {updated_count} data files")
 
         # Walk the local replication to find new files
-        new_paths = self._walk(source_root) - known_paths
+        new_paths = self._walk(temp_dir.name) - known_paths
 
         # Bulk create new files
         new_datafiles = []
         for path in new_paths:
             datafile = DataFile(source=self, path=path)
-            datafile.refresh_from_disk(root_path=source_root)
+            datafile.refresh_from_disk(source_root=temp_dir.name)
             new_datafiles.append(datafile)
             # TODO: Record last_updated?
         created_count = len(DataFile.objects.bulk_create(new_datafiles, batch_size=100))
         logger.debug(f"Created {created_count} data files")
 
-    def fetch(self):
+        temp_dir.cleanup()
+
+    def fetch(self, path):
         """
         Replicate the file structure from the remote data source and return the local path.
         """
-        logger.debug(f"Fetching source data for {self}")
+        logger.debug(f"Fetching source data for {self} ({self.get_type_display()})")
 
-        if self.type == DataSourceTypeChoices.LOCAL:
-            logger.debug(f"Data source type is local; skipping fetch")
-            # No replication is necessary for local sources
-            return urlparse(self.url).path
-
-        raise NotImplemented(f"fetch() not yet supported for {self.get_type_display()} data sources")
-
-        # TODO: Sync remote files to tempfile.TemporaryDirectory
+        try:
+            fetch_method = getattr(self, f'fetch_{self.type}')
+        except AttributeError:
+            raise NotImplementedError(f"fetch() not yet supported for {self.get_type_display()} data sources")
+
+        return fetch_method(path)
+
+    def fetch_local(self, path):
+        """
+        Skip fetching for local paths; return the source path directly.
+        """
+        logger.debug("Data source type is local; skipping fetch")
+        return urlparse(self.url).path
+
+    def fetch_git(self, path):
+        """
+        Perform a shallow clone of the remote repository using the `git` executable.
+        """
+        # Add authentication credentials to the URL (if specified)
+        if self.username and self.password:
+            url_components = list(urlparse(self.url))
+            # Prepend "username:password@" to the netloc, quoting each credential
+            url_components[1] = f'{quote(self.username)}:{quote(self.password)}@' + url_components[1]
+            url = urlunparse(url_components)
+        else:
+            url = self.url
+
+        result = subprocess.run(['git', 'clone', '--depth', '1', url, path])
 
     def _walk(self, root_path):
         """
@@ -126,8 +162,7 @@ class DataSource(models.Model):
         paths = set()
 
         for path, dir_names, file_names in os.walk(root_path):
-            path = path.split(root_path)[1]  # Strip root path
-            path.lstrip('/')
+            path = path.split(root_path)[1].lstrip('/')  # Strip root path
             if path.startswith('.'):
                 continue
             for file_name in file_names:
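
Note: the old code called path.lstrip('/') without assigning the result (str.lstrip returns a new string), which this hunk fixes by chaining the call. os.path.relpath expresses the same root-stripping intent more directly; a standalone sketch of the walk-and-strip pattern:

    import os

    def walk_relative(root_path):
        paths = set()
        for dirpath, dir_names, file_names in os.walk(root_path):
            rel = os.path.relpath(dirpath, root_path)  # '.' for the root itself
            if rel != '.' and rel.startswith('.'):
                continue  # skip hidden directories such as .git
            for file_name in file_names:
                paths.add(file_name if rel == '.' else os.path.join(rel, file_name))
        return paths
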
@@ -193,12 +228,12 @@ class DataFile(models.Model):
     # def get_absolute_url(self):
     #     return reverse('core:datafile', args=[self.pk])
 
-    def refresh_from_disk(self, root_path):
+    def refresh_from_disk(self, source_root):
         """
         Update instance attributes from the file on disk. Returns True if any attribute
         has changed.
         """
-        file_path = os.path.join(root_path, self.path)
+        file_path = os.path.join(source_root, self.path)
 
         # Get attributes from file on disk
         file_size = os.path.getsize(file_path)
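
Note: refresh_from_disk() compares a file's on-disk attributes against the stored ones to decide whether the DataFile changed; the rename from root_path to source_root matches the new call sites in sync(). The attribute-gathering half, sketched standalone (the SHA-256 checksum is an assumption; the diff does not show how the checksum field is computed):

    import hashlib
    import os

    def file_attributes(source_root, path):
        file_path = os.path.join(source_root, path)
        size = os.path.getsize(file_path)
        with open(file_path, 'rb') as f:
            checksum = hashlib.sha256(f.read()).hexdigest()
        return size, checksum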