Django's bulk_create()

I've done some reading about Django's bulk_create() and I'm currently trying to implement it but it's not functioning as I would expect. Originally my code didn't use bulk_create and it took about 33 seconds or so to import 6074 rows of data. Slow but it worked.

Models:

class Building(models.Model):
    community = models.ForeignKey('Community', related_name='Building Community Name')
    physical_location = models.CharField(max_length=80, null=True, blank=True)
    data_source = models.CharField(max_length=50, null=True, blank=True)
    facility_name = models.CharField(max_length=120, null=True, blank=True)
    facility_type = models.CharField(max_length=80, null=True, blank=True)
    size = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    audited = models.NullBooleanField(blank=True)
    audit_notes = models.TextField(blank=True) 

class RetrofitData(models.Model):
    building_id = models.ForeignKey('Building')
    retrofits_done = models.NullBooleanField(blank=True)
    retrofit_notes = models.TextField(blank=True)
    fuel_oil_preretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    district_heating_oil_usage_preretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    electricity_preretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    natural_gas_preretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    propane_preretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    biomass_preretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    fuel_oil_postretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    district_heating_oil_usage_postretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    electricity_postretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    natural_gas_postretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    propane_postretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    biomass_postretrofit = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    retrofit_cost = models.DecimalField(null=True, blank=True, max_digits=10, decimal_places=2)
    biomass_heat = models.NullBooleanField(blank=True)
    heat_recovery = models.NullBooleanField(blank=True)

Original code:

class BuildingInventoryImporter(dataimport.DataFileImporter):

   def toTrueFalse(self, val):
            if val == "Yes":
                return True
            elif val == "No":
                return False
            else:
                return None

    def decCleaner(self, val):
        if val == '':
            return None
        else:
            return val2dec(val)

    models = [Building, RetrofitData]

    @transaction.commit_manually
    @rollback_on_exception
    def do_import(self):
        book = xlrd.open_workbook(self.data_file.file.path, 
            encoding_override='cp1252')
        sheet = book.sheet_by_index(2)

        for row_index in range(1,sheet.nrows):
            row = sheet.row_values(row_index)

            temp_id= row_index
            community_name = row[0]
            gnis = row[1]
            physical_location = row[2]
            data_source = row[3]
            facility_type = row[5]

            if row[4] == '':
                if facility_type =="Other" or facility_type == "Office" or facility_type == "Public Assembly" or facility_type == "Public Safety":
                    facility_name = "Unavailable"
                elif facility_type =="Health Care - Hospitals":
                    facility_name = community_name + " Clinic"
                elif facility_type == "Education - K - 12":
                    facility_name = community_name + " School(s)"
            else:
                facility_name = row[4]    

            size = self.decCleaner(row[6])
            audited = self.toTrueFalse(row[7])
            audit_notes = row[8]

            building, created = self.get_or_new(Building, id=temp_id)

            try:
                community = Community.objects.get(gnis_feature_id=gnis)
            except Community.DoesNotExist:
                self.warning("The value entered for the Community GNIS: {0} does not exist.".format(row[1]))

                try:
                    community = Community.objects.get(name=community_name)
                    self.warning("The Community name: {0} is in the db but does not match its associated Community GNIS").format(row[0])
                except Community.DoesNotExist:
                    self.warning("Neither the Community name: {0} nor the Community GNIS: {1} exist.".format(row[0], row[1])) 
                    continue

            building.community = community
            building.physical_location = physical_location
            building.data_source = data_source
            building.facility_name = facility_name
            building.facility_type = facility_type
            building.size = size
            building.audited = audited
            building.audit_notes = audit_notes
            building.save()

            retrofit_data, created = self.get_or_new(RetrofitData, building_id=building)

            retrofit_data.retrofits_done =  self.toTrueFalse(row[9])
            retrofit_data.retrofit_notes = row[10]
            retrofit_data.fuel_oil_preretrofit = self.decCleaner(row[11])

            if row[12] == 999999999: #They decided that a unknown value would be represented as 999999999 in the dataset.
                retrofit_data.district_heating_oil_usage_preretrofit = None
            else:
                retrofit_data.district_heating_oil_usage_preretrofit = self.decCleaner(row[12])

            retrofit_data.electricity_preretrofit = self.decCleaner(row[13])
            retrofit_data.natural_gas_preretrofit = self.decCleaner(row[14])
            retrofit_data.propane_preretrofit = self.decCleaner(row[15])
            retrofit_data.biomass_preretrofit = self.decCleaner(row[16])
            retrofit_data.fuel_oil_postretrofit = self.decCleaner(row[17])
            retrofit_data.district_heating_oil_usage_postretrofit = self.decCleaner(row[18])
            retrofit_data.electricity_postretrofit = self.decCleaner(row[19])
            retrofit_data.natural_gas_postretrofit = self.decCleaner(row[20])
            retrofit_data.propane_postretrofit = self.decCleaner(row[21])
            retrofit_data.biomass_postretrofit = self.decCleaner(row[22])
            retrofit_data.retrofit_cost = self.decCleaner(row[23])
            retrofit_data.biomass_heat = self.toTrueFalse(row[24])
            retrofit_data.heat_recovery = self.toTrueFalse(row[25])
            retrofit_data.save()

        if self.dry_run:
            transaction.rollback()
        else:
            transaction.commit()
dataimport.register(BuildingInventoryImporter)

It would have to hit the database like ~1200 times throughout the course of the data import which was causing the slow import. So to remedy this I looked into using the bulk_create()

Modified Code:

class BuildingInventoryImporterV2(dataimport.DataFileImporter):

    models = [Building, RetrofitData]

    def do_import(self, dry_run=True):    
        book = xlrd.open_workbook(self.data_file.file.path, 
            encoding_override='cp1252')
            sheet = book.sheet_by_index(2)

        building_bulk_list = []
        retrofit_bulk_list = [] 

        for row_index in range(1,sheet.nrows):
            row = sheet.row_values(row_index)

            temp_id= row_index
            community_name = row[0]
            gnis = row[1]
            facility_type = row[5]

            try:
                community = Community.objects.get(gnis_feature_id=gnis)
            except Community.DoesNotExist:
                self.warning("The value entered for the Community GNIS: {0} does not exist.".format(row[1]))

                try:
                    community = Community.objects.get(name=community_name)
                    self.warning("The Community name: {0} is in the db but does not match its associated Community GNIS").format(row[0])
                except Community.DoesNotExist:
                    self.warning("Neither the Community name: {0} nor the Community GNIS: {1} exist.".format(row[0], row[1])) 
                    continue

            if row[4] == '':
                if facility_type =="Other" or facility_type == "Office" or facility_type == "Public Assembly" or facility_type == "Public Safety":
                    facility_name = "Unavailable"
                elif facility_type =="Health Care - Hospitals":
                    facility_name = community_name + " Clinic"
                elif facility_type == "Education - K - 12":
                    facility_name = community_name + " School(s)"
            else:
                facility_name = row[4]

            building_to_add = Building(    
                community=community,    
                physical_location=row[2],    
                data_source=row[3],    
                facility_name=facility_name,    
                facility_type=facility_type,    
                size=self.decCleaner(row[6]),    
                audited=self.toTrueFalse(row[7]),    
                audit_notes=row[8]    
            )
            building_bulk_list.append(building_to_add)
        if self.dry_run is False:
            Building.objects.bulk_create(building_bulk_list)

        for row_index in range(1,sheet.nrows):
            row = sheet.row_values(row_index)
            #They decided that a unknown value would be represented as 999999999 in the dataset.

            if row[12] == 999999999:    
                district_heating_oil_usage_preretrofit = None    
            else:    
                district_heating_oil_usage_preretrofit = self.decCleaner(row[12]) 

            retrofit_data_to_add = RetrofitData(    
                building_id=Building.objects.get(id=temp_id),    
                retrofits_done=self.toTrueFalse(row[9]),    
                retrofit_notes=row[10],    
                fuel_oil_preretrofit=self.decCleaner(row[11]),    
                district_heating_oil_usage_preretrofit=district_heating_oil_usage_preretrofit,    
                electricity_preretrofit=self.decCleaner(row[13]),    
                natural_gas_preretrofit=self.decCleaner(row[14]),    
                propane_preretrofit=self.decCleaner(row[15]),    
                biomass_preretrofit=self.decCleaner(row[16]),    
                fuel_oil_postretrofit=self.decCleaner(row[17]),    
                district_heating_oil_usage_postretrofit=self.decCleaner(row[18]),    
                electricity_postretrofit=self.decCleaner(row[19]),    
                natural_gas_postretrofit=self.decCleaner(row[20]),    
                propane_postretrofit=self.decCleaner(row[21]),    
                biomass_postretrofit=self.decCleaner(row[22]),    
                retrofit_cost=self.decCleaner(row[23]),    
                biomass_heat=self.toTrueFalse(row[24]),    
                heat_recovery=self.toTrueFalse(row[25])    
            )    
            retrofit_bulk_list.append(retrofit_data_to_add) 

        if self.dry_run is False:    
            Building.objects.bulk_create(retrofit_bulk_list)    
dataimport.register(BuildingInventoryImporterV2)

Problem arises when I get to the second block of code to bulk import RetroFitData. To my understanding bulk_create() will not assign an AutoField pk when it is called thus you need to place the bulk_create() data in the database before it is assigned AutoField pk. But it seems that that is not accurate as well. After running the import I get the following error:

Traceback:
    File "/home/bhernandez/ISER/virtualenvs/alaskawind/lib/python2.7/site-packages/django/core/handlers/base.py" in get_response
      111.                     response = wrapped_callback(request, *callback_args, **callback_kwargs)
    File "/home/bhernandez/ISER/virtualenvs/alaskawind/lib/python2.7/site-packages/django/utils/decorators.py" in _wrapped_view
      105.                     response = view_func(request, *args, **kwargs)
    File "/home/bhernandez/ISER/virtualenvs/alaskawind/lib/python2.7/site-packages/django/views/decorators/cache.py" in _wrapped_view_func
      52.         response = view_func(request, *args, **kwargs)
    File "/home/bhernandez/ISER/virtualenvs/alaskawind/lib/python2.7/site-packages/django/contrib/admin/sites.py" in inner
      206.             return view(request, *args, **kwargs)
    File "/home/bhernandez/ISER/virtualenvs/alaskawind/lib/python2.7/site-packages/django/contrib/auth/decorators.py" in _wrapped_view
      21.                 return view_func(request, *args, **kwargs)
    File "/home/bhernandez/ISER/aedg/core/adminviews.py" in data_import
      465.                 results = importer.run()
    File "/home/bhernandez/ISER/aedg/core/dataimport/__init__.py" in run
      114.         self.do_import()
    File "/home/bhernandez/ISER/aedg/akw/dataimport/etc.py" in do_import
      656.                 building_id=Building.objects.get(id=temp_id),    
    File "/home/bhernandez/ISER/virtualenvs/alaskawind/lib/python2.7/site-packages/django/db/models/manager.py" in manager_method
      92.                 return getattr(self.get_queryset(), name)(*args, **kwargs)
    File "/home/bhernandez/ISER/virtualenvs/alaskawind/lib/python2.7/site-packages/django/db/models/query.py" in get
      357.                 self.model._meta.object_name)

    Exception Type: DoesNotExist at /admin/core/datafile/174/import/
    Exception Value: Building matching query does not exist.

But when I check my Buildings table it's been populated... Any help or advice is much appreciated.

Answers


So when you come to create RetrofitData you need to know the ID of the Building objects you've just created.

The chances are you're using a database that has the ID field set to autoincrement, and therefore the objects you create using bulk_create won't get a PK assigned.

I suppose using building_bulk_list you could use it's length to get the last set of Building objects from the database, but then why not create the building_bulk_list with a more conventional approach for creating objects, calling save(), allowing you to create a list of IDs?

Then using that list of IDs you can run bulk_create for RetrofitData, iterating over that list of IDs to setup the relationship to Building?


Need Your Help

Eclipse : Target "Unknown" in android device chooser

java android eclipse android-emulator

I am using Samsung Galaxy SIV. When I try to select it in the 'Android device chooser', Eclipse recognize as a Samsung Galaxy SIV but in target it does not recognize. It says target "unknown". Bec...

Getting age automatically when given Date of Birth

javascript jquery html jquery-ui

I wanted to get the age of a person from his date of birth. I have a HTML code where I'm using a datepicker for dob, when I give the date-of-birth it show automatically show the age without giving ...