Fix file hashing

This commit is contained in:
jeremystretch 2023-01-26 14:47:52 -05:00
parent bfb21f62de
commit 9f299b7b42
4 changed files with 15 additions and 12 deletions

View File

@@ -15,4 +15,4 @@ class DataSourceAdmin(admin.ModelAdmin):
@admin.register(DataFile) @admin.register(DataFile)
class DataFileAdmin(admin.ModelAdmin): class DataFileAdmin(admin.ModelAdmin):
list_display = ('path', 'size', 'last_updated') list_display = ('path', 'size', 'last_updated')
readonly_fields = ('source', 'path', 'last_updated', 'size', 'checksum') readonly_fields = ('source', 'path', 'last_updated', 'size', 'hash')

View File

@@ -1,4 +1,4 @@
# Generated by Django 4.1.5 on 2023-01-26 19:00 # Generated by Django 4.1.5 on 2023-01-26 19:46
from django.db import migrations, models from django.db import migrations, models
import django.db.models.deletion import django.db.models.deletion
@@ -37,7 +37,7 @@ class Migration(migrations.Migration):
('path', models.CharField(editable=False, max_length=1000, unique=True)), ('path', models.CharField(editable=False, max_length=1000, unique=True)),
('last_updated', models.DateTimeField(editable=False)), ('last_updated', models.DateTimeField(editable=False)),
('size', models.PositiveIntegerField(editable=False)), ('size', models.PositiveIntegerField(editable=False)),
('checksum', models.CharField(editable=False, max_length=64)), ('hash', models.CharField(editable=False, max_length=64)),
('data', models.BinaryField()), ('data', models.BinaryField()),
('source', models.ForeignKey(editable=False, on_delete=django.db.models.deletion.CASCADE, related_name='datafiles', to='core.datasource')), ('source', models.ForeignKey(editable=False, on_delete=django.db.models.deletion.CASCADE, related_name='datafiles', to='core.datasource')),
], ],

View File

@@ -12,7 +12,7 @@ from django.urls import reverse
from django.utils import timezone from django.utils import timezone
from django.utils.translation import gettext as _ from django.utils.translation import gettext as _
from utilities.files import sha256_checksum from utilities.files import sha256_hash
from .choices import * from .choices import *
__all__ = ( __all__ = (
@@ -99,7 +99,7 @@ class DataSource(models.Model):
continue continue
# Bulk update modified files # Bulk update modified files
updated_count = DataFile.objects.bulk_update(updated_files, ['checksum']) updated_count = DataFile.objects.bulk_update(updated_files, ['hash'])
logger.debug(f"Updated {updated_count} data files") logger.debug(f"Updated {updated_count} data files")
# Bulk delete deleted files # Bulk delete deleted files
@@ -207,7 +207,7 @@ class DataFile(models.Model):
editable=False editable=False
) )
# TODO: Create a proper SHA256 field # TODO: Create a proper SHA256 field
checksum = models.CharField( hash = models.CharField(
max_length=64, max_length=64,
editable=False editable=False
) )
@@ -237,14 +237,14 @@ class DataFile(models.Model):
# Get attributes from file on disk # Get attributes from file on disk
file_size = os.path.getsize(file_path) file_size = os.path.getsize(file_path)
file_checksum = sha256_checksum(file_path).hexdigest() file_hash = sha256_hash(file_path).hexdigest()
# Update instance file attributes & data # Update instance file attributes & data
has_changed = file_size != self.size or file_checksum != self.checksum has_changed = file_size != self.size or file_hash != self.hash
if has_changed: if has_changed:
self.last_updated = timezone.now() self.last_updated = timezone.now()
self.size = file_size self.size = file_size
self.checksum = file_checksum self.hash = file_hash
with open(file_path, 'rb') as f: with open(file_path, 'rb') as f:
self.data = f.read() self.data = f.read()

View File

@@ -1,6 +1,9 @@
import hashlib import hashlib
def sha256_checksum(filepath): def sha256_hash(filepath):
# TODO: Write an actual checksum function """
return hashlib.sha256(filepath.encode('utf-8')) Return the SHA256 hash of the file at the specified path.
"""
with open(filepath, 'rb') as f:
return hashlib.sha256(f.read())