Hintergrund

Seit Juni 2014 melden Tankstellen in Deutschland ihre aktuellen Kraftstoffpreise an die Markttransparenzstelle für Kraftstoffe (MTS-K), eine Abteilung des Bundeskartellamtes. Die rechtliche Verpflichtung zur Meldung der Preisinformationen bietet eine seltene Möglichkeit die Entwicklung von Preisen zeitnah flächendeckend zu analysieren. Datenquelle für die Kraftstoffpreise und Tankstellen Informationen ist Tankerkönig.

  # packages
  library(tidyverse)
  library(ggthemes)
  library(gridExtra)
  library(tmap)
  library(leaflet)
  library(leaflet.extras)
  library(rgdal)
  library(DBI)
  library(odbc)
  
  # themes
  source(file = "code/functions/theme_base_FUN.R", echo = F, encoding = "UTF-8")
  
  # plot elements
  colors = c("#FF8F00", "#5A8F29", "#3C7DC4", "#2B2B2B")
  
  # DB connection
  dbcon <- dbConnect(odbc(), "fuelprice", timeout = 10, encoding = "windows-1252", bigint = "integer")

Tankstellen

Für die Übersicht der in Deutschland aktiven Tankstellen, wurden die Daten sämtlicher Tankstellen ausgewertet, welche im Beobachtungszeitraum (01.01. bis 31.12.) mindestens eine Preisinformation übermittelt haben.

nach Bundesländern

Die räumliche Zuordnung der Tankstellen erfolgt auf Basis der im Datensatz enthalten Angaben zu Längen- und Breitengrad einzelner Tankstellen.

  # query active stations in year
  year <- 2017
  query_stations_year <- paste0("
                    SELECT 
                      distinct(stid)
                    FROM 
                      price
                    WHERE 
                      time BETWEEN '", year,"-01-01 00:00:01'
                           AND '", year,"-12-31 23:59:59'
                    GROUP BY
                      stid;")

  act_stations <- dbGetQuery(dbcon, query_stations_year) %>%
    as_tibble()
  
  act_stations_string <- paste(act_stations$stid, collapse = ",")
  
  # Query stations data
  query_states <- paste0("
                    SELECT
                      stations.bl_kz,
                      stations.bl_name,
                      count,
                      area.area_bl,
                      pop.pop_bl,
                      round(count/area.area_bl,4) AS a_dens,
                      round(count*1000/(pop.pop_bl),4) AS p_dens
                    FROM 
                      (SELECT
                          bl_kz,
                          bl_name,
                          count(*) AS count 
                       FROM
                          stations
                       WHERE
                          bl_kz IS NOT NULL
                       AND
                          uuid = ANY('{",act_stations_string ,"}')
                       GROUP BY bl_kz, bl_name) stations
                    LEFT JOIN 
                      (SELECT
                          bl_kz,
                          area_bl
                       FROM 
                          area_bl
                         WHERE
                            date = '2016-12-31') area
                    ON
                      stations.bl_kz = area.bl_kz
                    LEFT JOIN
                      (SELECT
                          bl_kz,
                          pop_bl
                       FROM
                          pop_bl
                         WHERE
                            date = '", year, "-12-31') pop
                    ON
                      stations.bl_kz = pop.bl_kz
                    ORDER BY
                      bl_kz ASC;")
  
  stations_states <- dbGetQuery(dbcon, query_states) %>% 
    as_tibble() %>%
    mutate(bl_name = factor(bl_name, bl_name))
  
  # plot number of stations by state
  stations_states_plot <- ggplot(data = stations_states, aes(x = bl_name, y = count)) + 
    geom_bar(stat = "identity",
             show.legend = F,
             width = 0.9,
             fill = colors[3],
             alpha = 1) +
    #geom_text(aes(label = count), # add for printing actual numbers to bars
    #          hjust = if_else(stations_states$count >= 0.7*max(stations_states$count), 1.2, -0.1), 
    #          color = if_else(stations_states$count >= 0.7*max(stations_states$count), "white", "black"), 
    #          position = position_dodge(0.9), size = 3.5) +
    theme_base() +
    labs(x = NULL, y = "Anzahl Tankstellen") + 
    coord_flip()
  
  # plot number of stations per square kilometer by state
  area_states_plot <- ggplot(data = stations_states, mapping = aes(x = bl_name, y = a_dens)) + 
    geom_bar(stat = "identity",
             show.legend = F,
             width = 0.9,
             fill = colors[1],
             alpha = 1) +
    #geom_text(aes(label = round(a_dens, 2)), # add for printing actual numbers to bars
    #          hjust = if_else(stations_states$a_dens >= 0.7*max(stations_states$a_dens), 1.2, -0.1), 
    #          color = "black",
    #          position = position_dodge(0.9), size = 3.5) +
    theme_base() +
    labs(x = NULL, y = "... pro Quadratkilometer") + 
    coord_flip() + 
    scale_x_discrete(labels = rep("",dim(stations_states)[1]))
  
  # plot number of stations per population by state
  pop_states_plot <- ggplot(data = stations_states, mapping = aes(x = bl_name, y = p_dens)) + 
    geom_bar(stat = "identity",
             show.legend = F,
             width = 0.9,
             fill = colors[2],
             alpha = 1) +
    #geom_text(aes(label = round(p_dens, 2)), # add for printing actual numbers to bars
    #          hjust = if_else(stations_states$p_dens >= 0.7*max(stations_states$p_dens), 1.2, -0.1), 
    #          color = if_else(stations_states$p_dens >= 0.7*max(stations_states$p_dens), "white", "black"), 
    #          position = position_dodge(0.9), size = 3.5) +
    theme_base() +
    labs(x = NULL, y = "... pro 1.000 Einwohner") + 
    coord_flip() +
    scale_x_discrete(labels = rep("",dim(stations_states)[1]))
  
  # arrange plots
  grid.arrange(stations_states_plot, area_states_plot, pop_states_plot,
               ncol = 3,
               widths = c(4.5,3,3))

in den 7 größten Städten

  # query data
  query_city7 <- paste0("
                    SELECT
                      pop.kreis_kz,
                      stations.kreis_name,
                      count, 
                      pop.pop_kreis,
                      area.area_kreis,
                      round(count/area.area_kreis, 4) AS a_dens,
                      round(count*1000/pop.pop_kreis, 4) AS p_dens
                    FROM 
                      (SELECT
                          kreis_kz,
                          pop_kreis 
                       FROM
                          pop_kreis
                       WHERE
                          date = '", year, "-12-31'
                       AND
                          pop_kreis IS NOT NULL
                      ORDER BY
                          pop_kreis DESC
                      LIMIT 7) pop
                    LEFT JOIN
                      (SELECT
                          kreis_kz,
                          kreis_name,
                          count(*) AS count 
                       FROM
                          stations
                       WHERE
                          uuid = ANY('{",act_stations_string ,"}')
                       GROUP BY
                          kreis_kz,
                          kreis_name) stations
                    ON 
                      pop.kreis_kz = stations.kreis_kz
                    LEFT JOIN
                      (SELECT
                          kreis_kz,
                          area_kreis 
                      FROM
                          area_kreis
                      WHERE
                          date = '2016-12-31') area
                    ON
                      pop.kreis_kz = area.kreis_kz
                    ORDER BY
                      pop.kreis_kz ASC;")

  stations_city7 <- dbGetQuery(dbcon, query_city7) %>%
    as_tibble() %>%
    mutate(kreis_name = factor(kreis_name, kreis_name))
  
  # plot number of stations by city
  stations_city7_plot <- ggplot(data = stations_city7, mapping = aes(x = kreis_name, y = count)) + 
    geom_bar(stat = "identity",
             show.legend = F,
             width = 0.9,
             fill = colors[3],
             alpha = 1) +
    #geom_text(aes(label = count), # add for printing actual numbers to bars
    #          hjust = if_else(stations_city7$count >= 0.7*max(stations_city7$count), 1.2, -0.1), 
    #          color = if_else(stations_city7$count >= 0.7*max(stations_city7$count), "white", "black"), 
    #          position = position_dodge(0.9), size = 3.5) +
    theme_base() +
    labs(x = NULL, y = "Anzahl Tankstellen") + 
    coord_flip()
  
  # plot number of stations per square kilometer by city
  area_city7_plot <- ggplot(data = stations_city7, mapping = aes(x = kreis_name, y = a_dens)) + 
    geom_bar(stat = "identity",
             show.legend = F,
             width = 0.9,
             fill = colors[1],
             alpha = 1) +
    #geom_text(aes(label = round(a_dens, 2)), # add for printing actual numbers to bars
    #          hjust = if_else(stations_city7$a_dens >= 0.7*max(stations_city7$a_dens), 1.2, -0.1), 
    #          color = "black",
    #          position = position_dodge(0.9), size = 3.5) +
    theme_base() +
    labs(x = NULL, y = "... pro Quadratkilometer") + 
    coord_flip() + 
    scale_x_discrete(labels = rep("",dim(stations_city7)[1]))
  
  # plot number of stations per population by city
  pop_city7_plot <- ggplot(data = stations_city7, mapping = aes(x = kreis_name, y = p_dens)) + 
    geom_bar(stat = "identity",
             show.legend = F,
             width = 0.9,
             fill = colors[2],
             alpha = 1) +
    #geom_text(aes(label = round(p_dens, 2)), # add for printing actual numbers to bars
    #          hjust = if_else(stations_city7$p_dens >= 0.7*max(stations_city7$p_dens), 1.2, -0.1), 
    #          color = if_else(stations_city7$p_dens >= 0.7*max(stations_city7$p_dens), "white", "black"), 
    #          position = position_dodge(0.9), size = 3.5) +
    theme_base() +
    labs(x = NULL, y = "... pro 1.000 Einwohner") + 
    coord_flip() +
    scale_x_discrete(labels = rep("",dim(stations_city7)[1]))
  
  # arrange plots
  grid.arrange(stations_city7_plot, area_city7_plot, pop_city7_plot,
               ncol = 3,
               widths = c(4,3,3))

nach Landkreisen

  # load map county map data
  county_shp_in <- readOGR(dsn = "data/map2019/vg250_ebenen", 
                           layer="VG250_KRS", encoding = "UTF-8",
                           stringsAsFactors = F, verbose = F, use_iconv = T)
  
  # query county data
  query_county <- paste0("
                    SELECT
                      stations.kreis_kz,
                      stations.kreis_name,
                      count,
                      area.area_kreis,
                      pop.pop_kreis,
                            cars.kfz,
                            cars.lkw,
                      round(count/area.area_kreis, 4) AS a_dens, 
                      round(count*1000/pop.pop_kreis, 4) AS p_dens,
                            round(count*1000/cars.kfz::decimal, 4) AS kfz_dens,
                            round(count*1000/cars.lkw::decimal, 4) AS lkw_dens
                    FROM 
                      (SELECT
                          kreis_kz,
                          kreis_name,
                          count(*) AS count
                       FROM
                          stations_test
                       WHERE
                          kreis_kz IS NOT NULL
                       AND
                          uuid = ANY('{",act_stations_string ,"}')
                       GROUP BY
                          kreis_kz,
                          kreis_name) stations
                    LEFT JOIN
                      (SELECT
                          kreis_kz,
                          area_kreis
                       FROM
                          area_kreis
                         WHERE
                            date = '2016-12-31') area
                    ON
                      stations.kreis_kz = area.kreis_kz
                    LEFT JOIN
                      (SELECT
                          kreis_kz,
                          pop_kreis
                       FROM
                          pop_kreis
                         WHERE
                            date = '", year, "-12-31') pop
                    ON
                      stations.kreis_kz = pop.kreis_kz
                    LEFT JOIN
                      (SELECT
                          kreis_kz,
                          kfz,
                          lkw
                       FROM
                          car_numbers
                         WHERE
                            date = '", year, "-01-01') cars
                    ON
                      stations.kreis_kz = cars.kreis_kz
                    ORDER BY
                            kreis_kz ASC;")

  stations_county <- dbGetQuery(dbcon, query_county) %>%
    as_tibble()
  
  # join map data and queried data
  county_shp <- county_shp_in
  county_shp@data <- county_shp_in@data %>%
    left_join(stations_county, by = c("AGS" = "kreis_kz"))
  
  # map number of stations
  map_kreise_count <- tm_shape(shp = county_shp, name = "Anzahl Tankstellen") + 
    tm_polygons("count",
                style = "jenks",
                palette = "Blues", n = 6, contrast = c(0.4, 0.95),
                border.col = "white",
                border.alpha = 0.5,
                title = "",
                id = "GEN",
                legend.format = list(digits = 0),
                showNA = F,
                colorNA = "grey90",
                popup.vars = c("Number stations:" = "count"),
                popup.format = list(digits = 0)) +
    tm_layout(title = "Anzahl Tankstellen",
              title.size = 1.1,
              title.position = c("center","top"),
              inner.margins = c(0.05,0.10,0.10,0.10),
              legend.position = c("left","bottom"),
              frame  = F) +
    tm_scale_bar(color.dark = "gray70",
                 position = c("right","bottom"))
  
  # map number of station per square kilometer
  map_kreise_area <- tm_shape(shp = county_shp, name = "Tankstellen pro Quadratkilometer") + 
    tm_polygons("a_dens",
                style = "jenks",
                palette = "Oranges", n = 6, contrast = c(0.4, 0.95),
                border.col = "white",
                border.alpha = 0.5,
                title = "",
                id = "GEN",
                legend.format = list(digits = 2),
                showNA = F,
                colorNA = "grey90",
                popup.vars = c("Number stations:" = "a_dens"),
                popup.format = list(digits = 2)) +
    tm_layout(title = "Tankstellen pro Quadratkilometer",
              title.size = 1.1,
              title.position = c("center","top"),
              inner.margins = c(0.05,0.10,0.10,0.10),
              legend.position = c("left","bottom"),
              frame  = F) +
    tm_scale_bar(color.dark = "gray70",
                 position = c("right","bottom"))
  
    # map number of stations per 1.000 population
    map_kreise_pop <- tm_shape(shp = county_shp, name = "Tankstellen pro 1.000 Einwohner") + 
    tm_polygons("p_dens",
                style = "jenks",
                palette = "Greens", n = 6, contrast = c(0.4, 0.95),
                border.col = "white",
                border.alpha = 0.5,
                title = "",
                id = "GEN",
                legend.format = list(digits = 2),
                showNA = F,
                colorNA = "grey90",
                popup.vars = c("Number stations:" = "p_dens"),
                popup.format = list(digits = 2)) +
    tm_layout(title = "Tankstellen pro 1.000 Einwohner",
              title.size = 1.1,
              title.position = c("center","top"),
              inner.margins = c(0.05,0.10,0.10,0.10),
              legend.position = c("left","bottom"),
              frame  = F) +
    tm_scale_bar(color.dark = "gray70",
                 position = c("right","bottom"))
  
  # map number of stations per 1.000 cars
  map_kreise_cars <- tm_shape(shp = county_shp, name = "Tankstellen pro 1.000 zugelassene PKW") + 
    tm_polygons("kfz_dens",
                style = "jenks",
                palette = "PuRd", n = 6, contrast = c(0.4, 0.95),
                border.col = "white",
                border.alpha = 0.5,
                title = "",
                id = "GEN",
                legend.format = list(digits = 2),
                showNA = F,
                colorNA = "grey90",
                popup.vars = c("Number stations:" = "kfz_dens"),
                popup.format = list(digits = 2)) +
    tm_layout(title = "Tankstellen pro 1.000 zugelassene PKW",
              title.size = 1.1,
              title.position = c("center","top"),
              inner.margins = c(0.05,0.10,0.10,0.10),
              legend.position = c("left","bottom"),
              frame  = F) +
    tm_scale_bar(color.dark = "gray70",
                 position = c("right","bottom"))
  
  # arrange maps
  tmap_arrange(map_kreise_count, map_kreise_area, map_kreise_pop, map_kreise_cars,
               ncol = 2, nrow = 2)

nach Marken

Die Marken der einzelnen Tankstellen sind im Datensatz von Tankerkönig nicht immer eindeutig benannt. Die Zuordnung der Tankstellen zu den jeweiligen Marken erfolgte daher mittels automatischer Textmustererkennung. Daher sind gewisse Fehlzuordnungen nicht auszuschließen. Ein stichprobenartiger Vergleich zeigt jedoch, dass die automatische Zuordnung nahe an der Realität liegt.

  # query brands in dataset
  query_brands <- paste0("
                    SELECT
                      distinct(brand_cat),
                      count(*)
                    FROM
                      stations_test
                    WHERE
                      uuid = ANY('{",act_stations_string ,"}')
                    GROUP BY
                      brand_cat;")
  
  # tidy data, assign brand colors, calc shares
  brands <- dbGetQuery(dbcon, query_brands) %>%
    as_tibble() %>%
    arrange(brand_cat) %>%
    add_column(color = c("yellow","blue","blueviolet","orange",
                         "lightblue","green","gold","deepskyblue",
                         "darkblue","lightgreen","red","grey",
                         "hotpink","white")) %>%
    arrange(desc(count)) %>%
    mutate(brand_cat = factor(brand_cat, brand_cat)) %>%
    mutate(ypos = sum(count) - cumsum(count) + count - 0.5*count) %>%
    mutate(share = count/sum(count)*100)
  
  # plot brand shares
  ggplot(data = brands, mapping = aes(x = "", y = count, fill = brand_cat)) +
      geom_bar(stat = "identity",
               color = "black",
               show.legend = T,
               width = 1) +
      labs(x = NULL, y = NULL) +
      scale_fill_manual(values = brands$color, name = "", 
                        labels = paste(brands$brand_cat, " - ", brands$count, " (", round(brands$share,0), "%)", sep = ""),
                        guide = guide_legend(ncol = 1)) +
      coord_polar(theta = "y", start = 0) + 
      geom_text(aes(y = ypos, label = brand_cat, x = 1.75), 
                check_overlap = T) + 
      theme_base() +
      theme(panel.grid.major.x = element_blank(),
          axis.text = element_blank(),
          legend.position = "right")